Example #1
    def children_pull(self,
                      existing_caches=tuple(),
                      only=tuple(),
                      skip=tuple()):
        # FIXME this is really a recursive pull for organization level only ...
        sname = lambda gen: sorted(gen, key=lambda c: c.name)

        def refresh(c):
            updated = c.meta.updated
            newc = c.refresh()
            if newc is None:
                return

            nupdated = newc.meta.updated
            if nupdated != updated:
                return newc

        existing = sname(existing_caches)
        if not self._debug:
            skipexisting = {
                e.id: e
                for e in Async(rate=self._async_rate)(
                    deferred(refresh)(e) for e in existing) if e is not None
            }
        else:  # debug ...
            skipexisting = {
                e.id: e
                for e in (refresh(e) for e in existing) if e is not None
            }

        # FIXME
        # in theory the remote could change between these two loops
        # since we currently cannot do a single atomic pull for
        # a set of remotes and have them refresh existing files
        # in one shot

        if not self._debug:
            yield from (
                rc for d in Async(rate=self._async_rate)(
                    deferred(child.bootstrap)(
                        recursive=True, only=only, skip=skip)
                    for child in sname(self.children)
                    #if child.id in skipexisting
                    # TODO when datasets have an 'anything in me updated'
                    # field then we can use that to skip things that haven't
                    # changed (hello git ...)
                ) for rc in d)
        else:  # debug
            yield from (
                rc
                for d in (child.bootstrap(recursive=True, only=only, skip=skip)
                          for child in sname(self.children))
                #if child.id in skipexisting
                # TODO when datasets have an 'anything in me updated'
                # field then we can use that to skip things that haven't
                # changed (hello git ...)
                for rc in d)
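
Every snippet on this page uses the same core idiom: deferred(fn)(args) wraps a call without executing it, and the Async instance consumes a generator of those wrapped calls, runs them concurrently, and hands back their results. A minimal sketch of just that idiom, assuming Async and deferred come from pyontutils.utils (none of the scraped snippets show their imports):

from pyontutils.utils import Async, deferred  # assumed import, not shown in the snippets above

def slow_double(n):
    # stand-in for any blocking per-item call (network fetch, cache refresh, ...)
    return n * 2

values = list(range(10))
# rate caps dispatch frequency in Hz (compare the 'Effective rate' print in Example #19);
# many examples on this page simply call Async() with no throttle
doubled = Async(rate=5)(deferred(slow_double)(v) for v in values)
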
Example #2
    def test_mbf_header(self):
        test_id = 'N:dataset:bec4d335-9377-4863-9017-ecd01170f354'
        test_dataset = [d.cache for d in self.test_datasets if d.cache.id == test_id][0]
        if not list(test_dataset.local.children):
            rchilds = list(test_dataset.rchildren)
            xmls = [c for c in rchilds if c.suffix == '.xml']
            Async(rate=5)(deferred(x.fetch)() for x in xmls if not x.exists())
            #[x.fetch() for x in xmls if not x.exists()]
            local_xmls = [x.local for x in xmls]
        else:
            local_xmls = list(test_dataset.local.rglob('*.xml'))
            if any(p for p in local_xmls if not p.exists()):
                raise BaseException('unfetched children')

        embfs = [exml.ExtractXml(x) for x in local_xmls]
        d = embfs[0].asDict()
        blob = [e.asDict() for e in embfs]
        errors = [b.pop('errors') for b in blob if 'errors' in b]
        error_types = set(e['validator'] for es in errors for e in es)
        if export:  # `export` is defined outside this snippet
            with open('mbf-test.json', 'wt') as f:
                json.dump(blob, f, indent=2, cls=JEncode)
            with open('mbf-errors.json', 'wt') as f:
                json.dump(errors, f, indent=2, cls=JEncode)

        assert error_types == {'not'} or not error_types, f'unexpected error type! {error_types}'
Example #3
def zap_deleted(get_annos):
    annos = get_annos()
    new_annos = get_annos.get_annos_from_api(len(annos), 200)
    n_deleted = len([a for a in new_annos if a in annos])
    print('there are', n_deleted, 'potentially deleted annotations')
    missing = []
    h = get_annos.h()

    def thing(id):
        return id, h.head_annotation(id).ok

    # work backwards to cull deleted annotations
    size = 500
    n_chunks = len(annos) // size
    for i, anno_chunk in enumerate(chunk_list(list(reversed(annos)), size)):
        if i < 10:
            continue
        print('chunk size', size, 'number', i + 1 , 'of', n_chunks, 'found', len(missing))
        if len(missing) >= n_deleted:
            break
        responses = Async(25)(deferred(thing)(a.id) for a in anno_chunk)
        missing += [id for id, ok in responses if not ok]

    # TODO actually remove them
    embed()
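
zap_deleted works through the annotations in chunks of 500, firing one Async batch per chunk so progress can be printed and the loop can bail out early once enough missing ids have been found. A hedged sketch of that chunk-then-Async loop, with a local chunking helper standing in for chunk_list (which is not shown in the snippet) and a toy check in place of h.head_annotation(id).ok:

from pyontutils.utils import Async, deferred  # assumed import

def chunks(seq, size):
    # local stand-in for the chunk_list helper used above
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

def check(item):
    return item, item % 7 != 0  # pretend every 7th item is gone

items = list(range(2000))
missing = []
for chunk in chunks(items, 500):
    responses = Async(25)(deferred(check)(x) for x in chunk)  # rate passed positionally, as above
    missing += [x for x, ok in responses if not ok]
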
Example #4
def catalog_extras(fetch=False):
    path = Path(auth.get_path('ontology-local-repo'), 'ttl')
    cat = (path / 'catalog-v001.xml').as_posix()
    with open((path / '../catalog-extras').as_posix(),
              'rt') as ce, open(cat, 'rt') as c:
        clines = c.readlines()
        celines = ce.readlines()

    if clines[-2] != celines[-1]:
        with open(cat, 'wt') as f:
            f.writelines(clines[:-1] + celines + clines[-1:])
    else:
        print(tc.blue('INFO:'),
              'extras already added to catalog doing nothing')

    if fetch:
        print(tc.blue('INFO:'), 'fetching extras')

        def fetch_and_save(url, loc):
            resp = requests.get(url)
            saveloc = (path / loc).as_posix()
            if resp.ok:
                with open(saveloc, 'wb') as f:
                    f.write(resp.content)

                print(tc.blue('INFO:'), f'{url:<60} written to {loc}')
            else:
                print(tc.red('WARNING:'), f'failed to fetch {url}')

        Async()(deferred(fetch_and_save)(url, loc) for line in celines
                for _, _, _, url, _, loc, _ in (line.split('"'), ))
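
The closing genexpr in catalog_extras uses a one-element tuple, (line.split('"'), ), to destructure each catalog line inline: splitting on double quotes puts the URL at index 3 and the local filename at index 5, so both can be unpacked without a helper function. A small sketch of that destructuring trick on a fabricated line (the attribute layout is an assumption about catalog-extras, not taken from the real file):

line = '<uri id="extra0" name="http://example.org/x.ttl" uri="x.ttl"/>'
for _, _, _, url, _, loc, _ in (line.split('"'), ):
    print(url, '->', loc)  # http://example.org/x.ttl -> x.ttl
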
Example #5
    def boost(func: Callable,
              kwargs_list: List[dict],
              batch_size: int = 3,
              rate: int = None) -> list:
        """ Async boost for Function/Method & list of kwarg params for Function/Method.

        :param func: Function/Method to be asynchronously called.
        :param kwargs_list: Function/Method perameters for each call.
        :param batch_size: Batch size. Default 3
        :param rate: Inner batch size. Auto set to max possible.
        :returns: Generator of repsonses from func.

        >>>from ontquery.plugins.services.interlex_client import InterLexClient
        >>>ilx_cli = InterLexClient(base_url='https://test3.scicrunch.org/api/1/')
        >>>kwargs_list = [{'label': 'Label 1', 'type': 'term'}, {'label': 'Label 2', 'type': 'term'}]
        >>>self.boost(ilx_cli.add_entity, kwargs_list)
        """
        # InterLex specific batch size range #
        if batch_size > 25:
            batch_size = 25  # trust me; this is the MAX. Any more freaks out the PHP workers.
        if batch_size < 3:
            batch_size = 3  # Any less than 3 and async isn't worth it.
        # Worker #
        gin = lambda kwargs: func(**kwargs)
        # Builds futures dynamically #
        results = []
        for step in range(0, len(kwargs_list), batch_size):
            print('Step ->', step)  # So you can restart from last step.
            results += Async(rate=rate)(deferred(gin)(kwargs)
                                        for kwargs in kwargs_list[step:step +
                                                                  batch_size])
        return results
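
deferred captures positional arguments, so boost wraps the call in the gin lambda to expand each kwargs dict inside the worker. A stripped-down sketch of that fan-out over keyword arguments, with a toy add_entity in place of the InterLex client call and the same assumed pyontutils.utils import:

from pyontutils.utils import Async, deferred  # assumed import

def add_entity(label=None, type=None):
    # stand-in for ilx_cli.add_entity; the shape is borrowed from the docstring above
    return {'label': label, 'type': type}

kwargs_list = [{'label': 'Label 1', 'type': 'term'},
               {'label': 'Label 2', 'type': 'term'}]
call = lambda kwargs: add_entity(**kwargs)  # same role as gin above
results = Async()(deferred(call)(kw) for kw in kwargs_list)
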
Example #6
    def triples(self):
        self.iri = rdflib.URIRef(f'https://apinatomy.org/uris/models/{self.id}')
        yield self.iri, rdf.type, readable.Graph
        yield self.iri, readable.name, rdflib.Literal(self.name)
        yield self.iri, readable.abbreviation, rdflib.Literal(self.abbreviation)
        externals = []
        for id, blob in self.resources.items():
            if 'class' not in blob:
                logd.warning(f'no class in\n{blob!r} for {id}')
                continue
            elif blob['class'] == 'Graph':
                continue

            obj = getattr(self, blob['class'])(blob, self.context, self.label_suffix)

            if blob['class'] == 'External':
                # defer lookup
                externals.append(obj)
                continue

            yield from obj.triples()

        Async()(deferred(lambda x: x._term)(e) for e in externals)
        for e in externals:
            yield from e.triples()
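
The Async call in triples discards its results on purpose: it is only there to force every External's _term lookup to resolve concurrently before the serial yield loop reads them. A hedged sketch of that prefetch-then-iterate pattern, assuming (as the code above does) that Async runs the jobs in threads of the same process so the cached state is visible afterwards:

from pyontutils.utils import Async, deferred  # assumed import

class External:
    # illustrative stand-in, not the apinatomy External class above
    def __init__(self, n):
        self.n = n
        self._term_cache = None

    @property
    def _term(self):
        if self._term_cache is None:
            self._term_cache = f'term-{self.n}'  # stand-in for a slow remote lookup
        return self._term_cache

externals = [External(n) for n in range(5)]
Async()(deferred(lambda x: x._term)(e) for e in externals)  # warm the caches, ignore results
for e in externals:
    print(e._term)  # served from the already-resolved value
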
Example #7
    def export_identifier_metadata(self, dump_path, dataset_blobs):

        if (self.latest and self.latest_id_met_path.exists()):
            blob_id_met = self.latest_id_met

        else:

            def fetch(id):  # FIXME error proof version ...
                try:
                    metadata = id.metadata()
                    metadata['id'] = id.identifier  # FIXME normalization ...
                    return metadata
                except requests.exceptions.HTTPError as e:
                    logd.error(e)
                except (requests.exceptions.ConnectionError,
                        requests.exceptions.SSLError) as e:
                    log.error(e)

            # retrieve doi metadata and materialize it in the dataset
            _dois = set([
                idlib.Auto(id) if not isinstance(id, idlib.Stream) else id
                for blob in dataset_blobs for id in chain(
                    adops.get(blob, ['meta', 'protocol_url_or_doi'],
                              on_failure=[]),
                    adops.get(blob, ['meta', 'originating_article_doi'],
                              on_failure=[]),
                    # TODO data["links"]?
                    [blob['meta']['doi']]) if id is not None
            ])

            dois = [d for d in _dois if isinstance(d, idlib.Doi)]
            metadatas = Async(rate=10)(deferred(fetch)(d) for d in dois)
            bads = [
                {
                    'id': d,
                    'reason': 'no metadata'
                }  # TODO more granular reporting e.g. 404
                for d, m in zip(dois, metadatas) if m is None
            ]
            metadatas = [m for m in metadatas if m is not None]
            blob_id_met = {
                'id': 'identifier-metadata',  # TODO is this ok ?
                'identifier_metadata': metadatas,
                'errors': bads,
                'meta': {
                    'count': len(metadatas)
                },
                'prov': {
                    'timestamp_export_start': self.timestamp,
                    'export_system_identifier': Path.sysid,
                    'export_hostname': gethostname(),
                    'export_project_path':
                    self.export_source_path.cache.anchor,
                },
            }

        with open(dump_path / 'identifier-metadata.json', 'wt') as f:
            json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

        return blob_id_met
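
The error reporting above leans on Async returning results in the same order as the inputs: fetch returns None when a lookup fails, and zip(dois, metadatas) pairs every None back with the identifier that produced it. A reduced sketch of that pairing, with a toy fetch in place of the idlib metadata call:

from pyontutils.utils import Async, deferred  # assumed import

def fetch(n):
    # stand-in for id.metadata(); returns None on failure like the fetch above
    return None if n % 3 == 0 else {'id': n}

ids = list(range(10))
metadatas = Async(rate=10)(deferred(fetch)(i) for i in ids)
bads = [{'id': i, 'reason': 'no metadata'}
        for i, m in zip(ids, metadatas) if m is None]
metadatas = [m for m in metadatas if m is not None]
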
Example #8
    def searchSquares(squares):
        def fetch(s):
            return s, list(query(label=s.label))

        return {
            s: match
            for s, match in Async(rate=10)(deferred(fetch)(s) for s in squares)
        }
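
searchSquares is the tersest form of another recurring idiom here: have the worker return (input, result) pairs so the Async output feeds a dict comprehension directly. A hedged sketch with a toy lookup standing in for query(label=...):

from pyontutils.utils import Async, deferred  # assumed import

def fetch(word):
    return word, [word.upper(), word.title()]  # stand-in for query(label=s.label)

words = ['cat', 'dog', 'fish']
matches = {w: m for w, m in Async(rate=10)(deferred(fetch)(w) for w in words)}
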
Example #9
 def main():
     terms = Terms(readonly=False)
     sgd = Dynamic(cache=True)
     ol = sgd.prod_sparc_organList()
     ol['nodes']
     ids = [n['id'] for n in ol['nodes']]
     res = Async()(deferred(by_organ)(i, sgd) for i in ids)
     nodes = [n for o, r in res for n in r['nodes']]
     rows = [(o, n['id'], n['lbl'], '\n'.join(syn(n)), defn(n))
             for o, r in res for n in r['nodes']]
     terms.upsert(*rows)
     terms.commit()
Example #10
 def loadData(cls):
     """ corresponds to the list of FMA ids from organParts
         for all organs in the sparc organsList """
     g = OntGraph()
     g.namespace_manager.populate_from(uPREFIXES)  # cls._ghead except fma doesn't define FMA:
     ol = cls.sgd.prod_sparc_organList()
     top_ids = [n['id'] for n in ol['nodes']]
     res = Async()(deferred(by_organ)(i, cls.sgd) for i in top_ids)
     #res = [by_organ(i, cls.sgd) for i in top_ids]
     #res_stats(res)  # why are there dupes? now we know!
     nodes = [n for o, r in res for n in r['nodes']]
     ids_raw = set(n['id'] for n in nodes if not n['id'].startswith('_:') and n['id'] != 'owl:Nothing')
     ids = set(g.namespace_manager.expand(id).toPython() for id in ids_raw)
     return ids_raw, ids
Example #11
 def get_itrips(self):
     results = self.get_scigraph_onts()
     iris = sorted(set(r['iri'] for r in results))
     gin = lambda i: (i,
                      self.sgg.getNeighbors(i,
                                            relationshipType='isDefinedBy',
                                            direction='OUTGOING'))
     nodes = Async()(deferred(gin)(i) for i in iris)
     imports = [(i, *[(e['obj'], 'owl:imports', e['sub'])
                      for e in n['edges']]) for i, n in nodes if n]
     self.itrips = sorted(
         set(
             tuple(rdflib.URIRef(OntId(e).iri) for e in t)
             for i, *ts in imports if ts for t in ts))
     return self.itrips
Example #12
 def make_rt(to_review_tuples, retired=retired):
     def inner(u, l, retired=retired):
         ne = sgg.getNeighbors(u, relationshipType="isDefinedBy", depth=1)
         if ne:
             curie = help_graph.qname(u)
             help_graph.g.add((URIRef(u), ilxtr.SciGraphLookup, URIRef(f'http://scigraph.olympiangods.org/scigraph/graph/{curie}')))
         if ne and ne['edges']:
             src = ' '.join([f'<{e["obj"]}>' for e in ne["edges"]])
         elif u in retired:
             src = retfile
         else:
             src = '<>'
         return f'{u:<70} {l:<50} {src}'
     out = Async(rate=3000)(deferred(inner)(u, l) for u, l in sorted(to_review_tuples, key=lambda a:a[-1]))
     return '\n'.join(out)
Example #13
    def counts(self):
        if not hasattr(self, '_counts'):
            size = 0
            dirs = 0
            files = 0
            need_meta = []
            if not self.is_dir():
                gen = self,

            else:
                gen = self.rchildren

            for c in gen:
                if c.is_dir():
                    dirs += 1
                else:
                    files += 1  # testing for broken symlinks is hard
                    try:
                        maybe_size = c.cache.meta.size
                    except AttributeError as e:
                        log.error(f'no cache or no meta for {c}\n{e}')
                        continue

                    if maybe_size is None:
                        need_meta.append(c)
                    else:
                        size += maybe_size

            if need_meta and self._refresh_on_missing:
                nl = '\n'
                log.info(
                    f'refreshing {len(need_meta)} files with missing metadata in {self}'
                    f'\n{nl.join(_.as_posix() for _ in need_meta)}')
                new_caches = Async(rate=self.rate)(deferred(c.cache.refresh)()
                                                   for c in need_meta)
                for c in new_caches:  # FIXME first time around meta doesn't get updated ??
                    if c is None:
                        continue  # file was deleted (logged previously)

                    if c.meta is None:
                        log.critical(f'missing metadata! {c}')
                        continue

                    size += c.meta.size

            self._counts = dict(size=FileSize(size), dirs=dirs, files=files)

        return self._counts
Example #14
def getOnts():
    # generate everything from these two so that they stay up to date
    # http://help.brain-map.org/display/api/Atlas+Drawings+and+Ontologies
    func = lambda url: requests.get(url).json()['msg']
    query = 'http://api.brain-map.org/api/v2/data/query.json?criteria=model::{model}'
    models = 'Atlas', 'Ontology', 'ReferenceSpace'
    res = Async(rate=10)(deferred(func)(query.format(model=model))
                         for model in models)

    _Atlas, _Ontology, _ReferenceSpace = res

    # FIXME looks like this API  changed

    onts = {o['id']: o for o in _Ontology}
    refs = {r['id']: r for r in _ReferenceSpace}
    refs[None] = None
    onts[None] = None
    want_onts = set()
    # ontology metadata
    for at in _Atlas:
        at['name']
        at['description']
        ref = refs[at['reference_space_id']]
        try:
            ont = onts[at['structure_graph_id']]
        except KeyError as e:
            ont = dict(id=at['structure_graph_id'], organism_id=2)
            print('hey guys, could you please fix this missing ont?', e)
        if ont:
            want_onts.add(ont['id'])
        if ont and ref:
            assert ont['organism_id'] == ref[
                'organism_id'], f"\n{ont['organism_id']}\n{ref['organism_id']}"

    have_atlases = set(o['id'] for o in onts.values() if o and o['has_atlas'])
    try:
        assert want_onts == have_atlases, f'\n{sorted(want_onts)}\n{sorted(have_atlases)}'
    except AssertionError as e:
        print('needs more attention', e)
    for oid in want_onts:
        try:
            ont = onts[oid]
        except KeyError:  # FIXME
            continue
        ont['name']
        ont['description']
        ont['id']
Example #15
 def test_rate(self):
     out = Async(rate=10)(deferred(lambda a:a)('lol') for _ in range(10))
Example #16
 def test_fast(self):
     out = Async()(deferred(lambda a:a)('lol') for _ in range(1000))
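
Examples #15 and #16 are the smoke tests for the two modes used throughout this page: a rate-limited pool and an unthrottled one. A quick check of what they hand back, assuming the pyontutils.utils import and the input-ordered results that Example #7 relies on:

from pyontutils.utils import Async, deferred  # assumed import

out_limited = Async(rate=10)(deferred(lambda a: a)('lol') for _ in range(10))
out_fast = Async()(deferred(lambda a: a)('lol') for _ in range(1000))
assert list(out_limited) == ['lol'] * 10
assert len(list(out_fast)) == 1000
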
Example #17
def swanson():
    """ not really a parcellation scheme
        NOTE: the defining information up here is now deprecated
        it is kept around to keep the code further down happy """

    source = auth.get_path('resources') / 'swanson_aligned.txt'
    ONT_PATH = 'http://ontology.neuinfo.org/NIF/ttl/generated/'
    filename = 'swanson_hierarchies'
    ontid = ONT_PATH + filename + '.ttl'
    PREFIXES = SwansonLabels.prefixes
    new_graph = makeGraph(filename, PREFIXES, writeloc='/tmp/')
    new_graph.add_ont(ontid,
                      'Swanson brain partomies',
                      'Swanson 2014 Partonomies',
                      'This file is automatically generated from ' + source.as_posix() + '.' + '**FIXME**',
                      'now')

    # FIXME citations should really go on the ... anatomy? scheme artifact
    definingCitation = 'Swanson, Larry W. Neuroanatomical Terminology: a lexicon of classical origins and historical foundations. Oxford University Press, USA, 2014.'
    definingCitationID = 'ISBN:9780195340624'
    new_graph.add_trip(ontid, 'NIFRID:definingCitation', definingCitation)
    new_graph.add_trip(ontid, 'NIFRID:definingCitationID', definingCitationID)

    with open(source, 'rt') as f:
        lines = [l.strip() for l in f.readlines()]

    # join header on page 794
    lines[635] += ' ' + lines.pop(636)
    #fix for capitalization since this header is reused
    fixed = ' or '.join([' ('.join([n.capitalize() for n in _.split(' (')]) for _ in lines[635].lower().split(' or ')]).replace('human','HUMAN')
    lines[635] = fixed

    data = []
    for l in lines:
        if not l.startswith('#'):
            level = l.count('.'*5)
            l = l.strip('.')
            if ' (' in l:
                if ') or' in l:
                    n1, l = l.split(') or')
                    area_name, citationP =  n1.strip().split(' (')
                    citation = citationP.rstrip(')')
                    d = (level, area_name, citation, 'NEXT SYN')
                    data.append(d)
                    #print(tc.red(tc.bold(repr(d))))

                area_name, citationP =  l.strip().split(' (')
                citation = citationP.rstrip(')')
            else:
                area_name = l
                citation = None

            d = (level, area_name, citation, None)
            #print(d)
            data.append(d)
    results = Async()(deferred(sgv.findByTerm)(d[1]) for d in data)
    #results = [None] * len(data)
    curies = [[r['curie'] for r in _ if 'curie' in r and 'UBERON' in r['curie']] if _ else [] for _ in results]
    output = [_[0] if _ else None for _ in curies]

    header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon']
    zoop = [header] + [r for r in zip(*zip(*data), output)] + \
            [(0, 'Appendix END None', None, None, None)]  # needed to add last appendix

    # TODO annotate the appendicies and the classes with these
    appendix_root_mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?

    class SP(rowParse):
        def __init__(self):
            self.nodes = defaultdict(dict)
            self._appendix = 0
            self.appendicies = {}
            self._last_at_level = {}
            self.names = defaultdict(set)
            self.children = defaultdict(set)
            self.parents = defaultdict(set)
            self.next_syn = False
            super().__init__(zoop)

        def Depth(self, value):
            if self.next_syn:
                self.synonym = self.next_syn
            else:
                self.synonym = False
            self.depth = value

        def Name(self, value):
            self.name = value

        def Citation(self, value):
            self.citation = value

        def NextSyn(self, value):
            if value:
                self.next_syn = self._rowind
            else:
                self.next_syn = False

        def Uberon(self, value):
            self.uberon = value

        def _row_post(self):
            # check if we are in the next appendix
            # may want to xref ids between appendicies as well...
            if self.depth == 0:
                if self.name.startswith('Appendix'):
                    if self._appendix:
                        self.appendicies[self._appendix]['children'] = dict(self.children)
                        self.appendicies[self._appendix]['parents'] = dict(self.parents)
                        self._last_at_level = {}
                        self.children = defaultdict(set)
                        self.parents = defaultdict(set)
                    _, num, apname = self.name.split(' ', 2)
                    if num == 'END':
                        return
                    self._appendix = int(num)
                    self.appendicies[self._appendix] = {
                        'name':apname.capitalize(),
                        'type':self.citation.capitalize() if self.citation else None}
                    return
                else:
                    if ' [' in self.name:
                        name, taxonB = self.name.split(' [')
                        self.name = name
                        self.appendicies[self._appendix]['taxon'] = taxonB.rstrip(']').capitalize()
                    else:  # top level is animalia
                        self.appendicies[self._appendix]['taxon'] = 'ANIMALIA'.capitalize()

                    self.name = self.name.capitalize()
                    self.citation = self.citation.capitalize()
            # nodes
            if self.synonym:
                self.nodes[self.synonym]['synonym'] = self.name
                self.nodes[self.synonym]['syn-cite'] = self.citation
                self.nodes[self.synonym]['syn-uberon'] = self.uberon
                return
            else:
                if self.citation:  # Transverse Longitudinal etc all @ lvl4
                    self.names[self.name + ' ' + self.citation].add(self._rowind)
                else:
                    self.name += str(self._appendix) + self.nodes[self._last_at_level[self.depth - 1]]['label']
                    #print(level, self.name)
                    # can't return here because they are their own level
                # replace with actually doing something...
                self.nodes[self._rowind]['label'] = self.name
                self.nodes[self._rowind]['citation'] = self.citation
                self.nodes[self._rowind]['uberon'] = self.uberon
            # edges
            self._last_at_level[self.depth] = self._rowind
            # TODO will need something to deal with the Lateral/
            if self.depth > 0:
                try:
                    parent = self._last_at_level[self.depth - 1]
                except:
                    breakpoint()
                self.children[parent].add(self._rowind)
                self.parents[self._rowind].add(parent)

        def _end(self):
            replace = {}
            for asdf in [sorted(n) for k,n in self.names.items() if len(n) > 1]:
                replace_with, to_replace = asdf[0], asdf[1:]
                for r in to_replace:
                    replace[r] = replace_with

            for r, rw in replace.items():
                #print(self.nodes[rw])
                o = self.nodes.pop(r)
                #print(o)

            for vals in self.appendicies.values():
                children = vals['children']
                parents = vals['parents']
                # need reversed so children are corrected before swap
                for r, rw in reversed(sorted(replace.items())):
                    if r in parents:
                        child = r
                        new_child = rw
                        parent = parents.pop(child)
                        parents[new_child] = parent
                        parent = list(parent)[0]
                        children[parent].remove(child)
                        children[parent].add(new_child)
                    if r in children:
                        parent = r
                        new_parent = rw
                        childs = children.pop(parent)
                        children[new_parent] = childs
                        for child in childs:
                            parents[child] = {new_parent}

            self.nodes = dict(self.nodes)

    sp = SP()
    tp = [_ for _ in sorted(['{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label'] for n in sp.nodes.values()])]
    #print('\n'.join(tp))
    #print(sp.appendicies[1].keys())
    #print(sp.nodes[1].keys())
    nbase = PREFIXES['SWAN'] + '%s'
    json_ = {'nodes':[],'edges':[]}
    parent = ilxtr.swansonBrainRegionConcept

    og = OntGraph()
    for node, anns in sp.nodes.items():
        nid = nbase % node
        new_graph.add_class(nid, parent, label=anns['label'])
        new_graph.add_trip(nid, 'NIFRID:definingCitation', anns['citation'])
        json_['nodes'].append({'lbl':anns['label'],'id':'SWA:' + str(node)})
        #if anns['uberon']:
            #new_graph.add_trip(nid, owl.equivalentClass, anns['uberon'])  # issues arrise here...
        [og.add(t) for t in map_term(rdflib.URIRef(nid), anns['label'], prefix='UBERON')]

    og.write(auth.get_path('ontology-local-repo') /
             'ttl/generated/swanson-uberon-mapping.ttl')
    #hrm = [(anns['label'], gn(anns['label'])) for node, anns in sp.nodes.items()]
    #ok = [(h, test, term_source(h, test)) for h, test in hrm if test]
    #notok = [h for h, test in hrm if not test]

    for appendix, data in sp.appendicies.items():
        aid = PREFIXES['SWAA'] + str(appendix)
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_trip(aid, 'ilxtr:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = 'swanr:hasPart' + str(appendix)
        apo = 'swanr:partOf' + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                new_graph.add_restriction(pid, ahp, cid)  # note hierarchy inverts direction
                new_graph.add_restriction(cid, apo, pid)
                json_['edges'].append({'sub':'SWA:' + str(child),'pred':apo,'obj':'SWA:' + str(parent)})

    return new_graph
Example #18
    def export_identifier_metadata(self, dump_path, latest_path,
                                   dataset_blobs):

        latest_id_met_path = latest_path / self.id_metadata
        if (self.latest and latest_id_met_path.exists()):
            with open(latest_id_met_path, 'rt') as f:
                blob_id_met = json.load(f)

        else:
            import requests

            def fetch(id):  # FIXME error proof version ...
                try:
                    metadata = id.metadata()
                    metadata['id'] = id
                    return metadata
                except (requests.exceptions.HTTPError,
                        idlib.exc.RemoteError) as e:
                    logd.error(e)
                except (requests.exceptions.ConnectionError,
                        requests.exceptions.SSLError,
                        idlib.exc.ResolutionError) as e:
                    log.error(e)

            def autoid_report_error(id, blob):
                try:
                    return idlib.Auto(id)
                except idlib.exc.MalformedIdentifierError as e:
                    msg = f'{blob["id"]} bad id: {id}'
                    logd.error(msg)
                    return None

            # retrieve doi metadata and materialize it in the dataset
            _dois = set([
                id if isinstance(id, idlib.Stream) else
                (fromJson(id) if isinstance(id, dict) else autoid_report_error(
                    id, blob)) for blob in dataset_blobs for id in chain(
                        adops.get(blob, ['meta', 'protocol_url_or_doi'],
                                  on_failure=[]),
                        adops.get(blob, ['meta', 'originating_article_doi'],
                                  on_failure=[]),
                        # TODO data["links"]?
                        [blob['meta']['doi']] if 'doi' in blob['meta'] else [])
                if id is not None
            ])

            dois = [d for d in _dois if isinstance(d, idlib.Doi)]
            metadatas = Async(rate=10)(deferred(fetch)(d) for d in dois)
            bads = [
                {
                    'id': d,
                    'reason': 'no metadata'
                }  # TODO more granular reporting e.g. 404
                for d, m in zip(dois, metadatas) if m is None
            ]
            metadatas = [m for m in metadatas if m is not None]
            blob_id_met = {
                'id': 'identifier-metadata',  # TODO is this ok ?
                'identifier_metadata': metadatas,
                'errors': bads,
                'meta': {
                    'count': len(metadatas)
                },
                'prov': {
                    'timestamp_export_start': self.timestamp,
                    'export_system_identifier': Path.sysid,
                    'export_hostname': gethostname(),
                    'export_project_path':
                    self.export_source_path.cache.anchor,
                },
            }

        with open(dump_path / self.id_metadata, 'wt') as f:
            json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

        return blob_id_met
Example #19
def url_blaster(urls,
                rate,
                timeout=5,
                verbose=False,
                debug=False,
                method='head',
                fail=False,
                negative=False,
                ok_test=lambda r: r.ok):
    shuffle(urls)  # try to distribute timeout events evenly across workers
    if verbose:
        [print(u) for u in sorted(urls)]

    class Timedout:
        ok = False

        def __init__(self, url):
            self.url = url

    r_method = getattr(requests, method)

    def method_timeout(url, _method=r_method):
        try:
            return _method(url, timeout=timeout)
        except (requests.ConnectTimeout, requests.ReadTimeout) as e:
            print('Timedout:', url, e)
            return Timedout(url)

    s = time()
    collector = [] if debug else None
    all_ = Async(rate=rate, debug=verbose,
                 collector=collector)(deferred(method_timeout)(url)
                                      for url in urls)
    o = time()
    not_ok = [_.url for _ in all_ if not ok_test(_)]
    d = o - s
    print(
        f'Actual time: {d}    Effective rate: {len(urls) / d}Hz    diff: {(len(urls) / d) / rate if rate else 1}'
    )
    print('Failed:')
    if not_ok:
        for nok in not_ok:
            print(nok)
        ln = len(not_ok)
        lt = len(urls)
        lo = lt - ln
        msg = f'{ln} urls out of {lt} ({ln / lt * 100:2.2f}%) are not ok. D:'
        print(msg)  # always print to get around joblib issues
        if negative and fail:
            if len(not_ok) == len(all_):
                raise AssertionError('Everything failed!')
        elif fail:
            raise AssertionError(f'{msg}\n' + '\n'.join(sorted(not_ok)))

    else:
        print(f'OK. All {len(urls)} urls passed! :D')

    if debug:
        from matplotlib.pyplot import plot, savefig, figure, show, legend, title
        from collections import defaultdict

        def asyncVis(collector):
            by_thread = defaultdict(lambda: [[], [], [], [], [], [], [], []])
            min_ = 0
            for thread, job, start, target_stop, stop, time_per_job, p, i, d in sorted(
                    collector):
                if not min_:
                    min_ = stop
                by_thread[thread][0].append(job)
                #by_thread[thread][1].append(start - min_)
                by_thread[thread][2].append(target_stop - stop)
                by_thread[thread][3].append(stop - min_)
                by_thread[thread][4].append(time_per_job)
                by_thread[thread][5].append(p)
                by_thread[thread][6].append(i)
                by_thread[thread][7].append(d)

            for thread, (job, y1, y2, y3, y4, y5, y6, y7) in by_thread.items():
                figure()
                title(str(thread))
                plot(job, [0] * len(job), 'r-')
                #plot(job, y1, label=f'stop')
                plot(job, y2, label=f'early by')
                #plot(job, y3, label=f'stop')
                #plot(job, y4, label=f'time per job')  # now constant...
                plot(job, y5, label='P')
                plot(job, y6, label='I')
                plot(job, y7, label='D')
                legend()
            show()

        asyncVis(collector)
        breakpoint()
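
The debug path above passes a collector list into Async and then unpacks per-job telemetry tuples (thread, job, start, target_stop, stop, time_per_job, p, i, d) for plotting. A hedged sketch that reuses the same collector without matplotlib, assuming the tuple layout shown in asyncVis and that the collector is only populated when debug is truthy (as the `collector = [] if debug else None` line suggests):

from collections import defaultdict
from pyontutils.utils import Async, deferred  # assumed import

collector = []
Async(rate=50, debug=True, collector=collector)(
    deferred(lambda a: a)(n) for n in range(100))

jobs_per_thread = defaultdict(int)
for thread, job, start, target_stop, stop, time_per_job, p, i, d in collector:
    jobs_per_thread[thread] += 1
print(dict(jobs_per_thread))  # how many jobs each worker thread handled
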
Example #20
 def test_rate_empty(self):
     out = Async(rate=20)(deferred(lambda a:a)('lol') for _ in range(0))
Example #21
def render(pred, root, direction=None, depth=10, local_filepath=None,
           branch='master', restriction=False, wgb='FIXME', local=False,
           verbose=False, flatten=False,):

    kwargs = {'local':local, 'verbose':verbose}
    prov = makeProv(pred, root, wgb)
    if local_filepath is not None:
        github_link = ('https://github.com/SciCrunch/NIF-Ontology/raw/'
                       f'{branch}/{local_filepath}')
        prov.append('<link rel="http://www.w3.org/ns/prov#wasDerivedFrom" '
                    f'href="{github_link}">')
        graph = graphFromGithub(github_link, verbose)
        qname = graph.namespace_manager._qhrm  # FIXME
        labels_index = {qname(s):str(o) for s, o in graph[:rdfs.label:]}
        if pred == 'subClassOf':
            pred = 'rdfs:subClassOf'  # FIXME qname properly?
        elif pred == 'subPropertyOf':
            pred = 'rdfs:subPropertyOf'
        try:
            kwargs['json'] = graph.asOboGraph(pred, restriction=restriction)
            kwargs['prefixes'] = {k:str(v) for k, v in graph.namespace_manager}
        except KeyError as e:
            if verbose:
                log.error(str(e))
            return abort(422, 'Unknown predicate.')
    else:
        kwargs['graph'] = sgg
        # FIXME this does not work for a generic scigraph load ...
        # and it should not be calculated every time anyway!
        # oh look, here we need a class again
        if False:
            versionIRI = [
                e['obj']
                for e in sgg.getNeighbors('http://ontology.neuinfo.org/'
                                          'NIF/ttl/nif.ttl')['edges']
                if e['pred'] == 'versionIRI'][0]
            #print(versionIRI)
            prov.append('<link rel="http://www.w3.org/ns/prov#wasDerivedFrom" '
                        f'href="{versionIRI}">')  # FIXME wrong and wont resolve
        prov.append('<meta name="representation" content="SciGraph">')  # FIXME :/
    kwargs['html_head'] = prov
    try:
        if root.startswith('http'):  # FIXME this codepath is completely busted?
            if 'prefixes' in kwargs:
                rec = None
                for k, v in kwargs.items():
                    if root.startswith(v):
                        rec = k + 'r:' + root.strip(v)  # FIXME what?!
                        break
                if rec is None:
                    raise KeyError(f'no prefix found for {root}')
            else:
                rec = sgv.findById(root)
            if 'curie' in rec:
                root_curie = rec['curie']
                # FIXME https://github.com/SciGraph/SciGraph/issues/268
                if not root_curie.endswith(':') and '/' not in root_curie:
                    root = root_curie
                else:
                    kwargs['curie'] = root_curie
        elif 'prefixes' not in kwargs and root.endswith(':'):
            kwargs['curie'] = root
            root = sgc._curies[root.rstrip(':')]  # also 268

        tree, extras = creatTree(*Query(root, pred, direction, depth), **kwargs)
        dematerialize(list(tree.keys())[0], tree)
        if flatten:
            if local_filepath is not None:
                def safe_find(n):
                    return {'labels':[labels_index[n]],
                            'deprecated': False  # FIXME inaccurate
                           }

            else:
                def safe_find(n):  # FIXME scigraph bug
                    if n.endswith(':'):
                        n = sgc._curies[n.rstrip(':')]
                    elif '/' in n:
                        prefix, suffix = n.split(':')
                        iriprefix = sgc._curies[prefix]
                        n = iriprefix + suffix

                    return sgv.findById(n)

            out = set(n for n in flatten_tree(extras.hierarchy))

            try:
                lrecs = Async()(deferred(safe_find)(n) for n in out)
            except RuntimeError:
                asyncio.set_event_loop(current_app.config['loop'])
                lrecs = Async()(deferred(safe_find)(n) for n in out)

            rows = sorted(((r['labels'][0] if r['labels'] else '')
                           + ',' + n for r, n in zip(lrecs, out)
                           # FIXME still stuff wrong, but better for non cache case
                           if not r['deprecated']), key=lambda lid: lid.lower())
            return '\n'.join(rows), 200, {'Content-Type':'text/plain;charset=utf-8'}

        else:
            return hfn.htmldoc(extras.html,
                               other=prov,
                               styles=hfn.tree_styles)

    except (KeyError, TypeError) as e:
        if verbose:
            log.error(f'{type(e)} {e}')
        if sgg.getNode(root):
            # FIXME distinguish these cases...
            message = 'Unknown predicate or no results.'
        elif 'json' in kwargs:
            message = 'Unknown root.'
            r = graph.namespace_manager.expand(root)
            for s in graph.subjects():
                if r == s:
                    message = ('No results. '
                               'You are querying a ttl file directly, '
                               'did you remember to set ?restriction=true?')
                    break
        else:
            message = 'Unknown root.'

        return abort(422, message)
Example #22
def would_you_like_to_know_more_question_mark():

    # resolving differences between classes
    more_ids = set((
        'http://uri.neuinfo.org/nif/nifstd/readable/ChEBIid',
        'http://uri.neuinfo.org/nif/nifstd/readable/GOid',
        'http://uri.neuinfo.org/nif/nifstd/readable/MeshUid',
        'http://uri.neuinfo.org/nif/nifstd/readable/PMID',
        'http://uri.neuinfo.org/nif/nifstd/readable/UmlsCui',
        'http://uri.neuinfo.org/nif/nifstd/readable/bamsID',
        'http://uri.neuinfo.org/nif/nifstd/readable/bonfireID',
        'http://uri.neuinfo.org/nif/nifstd/readable/cell_ontology_ID',
        'http://uri.neuinfo.org/nif/nifstd/readable/definingCitationID',
        'http://uri.neuinfo.org/nif/nifstd/readable/definingCitationURI',
        'http://uri.neuinfo.org/nif/nifstd/readable/emapMouseStageDataID',
        'http://uri.neuinfo.org/nif/nifstd/readable/emapMouseStageDiagramID',
        'http://uri.neuinfo.org/nif/nifstd/readable/externalSourceId',
        'http://uri.neuinfo.org/nif/nifstd/readable/externalSourceURI',
        'http://uri.neuinfo.org/nif/nifstd/readable/gbifID',
        'http://uri.neuinfo.org/nif/nifstd/readable/gbifTaxonKeyID',
        'http://uri.neuinfo.org/nif/nifstd/readable/gene_Ontology_ID',
        #'http://uri.neuinfo.org/nif/nifstd/readable/hasExternalSource',
        'http://uri.neuinfo.org/nif/nifstd/readable/hasGenbankAccessionNumber',
        'http://uri.neuinfo.org/nif/nifstd/readable/imsrStandardStrainName',
        'http://uri.neuinfo.org/nif/nifstd/readable/isReplacedByClass',
        'http://uri.neuinfo.org/nif/nifstd/readable/jaxMiceID',
        'http://uri.neuinfo.org/nif/nifstd/readable/ncbiTaxID',
        'http://uri.neuinfo.org/nif/nifstd/readable/neuronamesID',
        'http://uri.neuinfo.org/nif/nifstd/readable/nifID',
        'http://uri.neuinfo.org/nif/nifstd/readable/sao_ID',
        'http://uri.neuinfo.org/nif/nifstd/readable/umls_ID',
        'http://www.geneontology.org/formats/oboInOwl#id',
    ))

    outside = []
    eee = {}
    resolver_not_ilx_only_but_not_in_scigraph = set()  # resources.ttl
    _res = Graph().parse((gitf / 'NIF-Ontology/ttl/resources.ttl').as_posix(), format='turtle')
    reslookup = {uri:[l] for uri, l in _res.subject_objects(rdfs.label)}
    for uri in chain(h_uris, resolver_not_ilx_only):
        if 'uri.neuinfo.org' in uri:
            try:
                meta = sgg.getNode(uri.toPython())['nodes'][0]['meta']
                asdf = {hng.qname(k):v for k, v in meta.items() if k in more_ids}
            except TypeError:
                resolver_not_ilx_only_but_not_in_scigraph.add(uri)  # resources.ttl ;)
                if uri in reslookup:  # no differentia
                    asdf = False
                else:
                    asdf = False
                    print('WTF', uri)
            if asdf:
                #print(uri, asdf)
                eee[uri] = asdf
                for l in asdf.values():
                    for e in l:
                        outside.append(e)

    outside_dupes = [v for v, c in Counter(outside).most_common() if c > 1]
    eee_dupes = {k:v for k, v in eee.items() if anyMembers(outside_dupes, *(e for l in v.values() for e in l))}

    #for uri, meta in sorted(eee_dupes.items(), key=lambda a:sorted(a[1].values())):
        #print(uri.toPython(), sorted((e.replace('PMID: ', 'PMID:'), k) for k, l in meta.items() for e in l))


    # attempt to deal with label mappings
    iexisting = defaultdict(set)
    iiexisting = {}
    for i, existing in zip(datal('ilx'), datal('iri')):
        #if 'uri.neuinfo.org' in existing:
        if 'interlex.org' not in existing and 'neurolex.org' not in existing:
            iexisting[i].add(URIRef(existing))
            iiexisting[URIRef(existing)] = i
    iexisting = {**iexisting}

    _ilabs = {k:l for k, l in zip(datal('ilx'), datal('label'))}
    def inner(iri):
        resp = sgv.findById(iri)
        if resp is not None:
            l = resp['labels']
        else:
            l = [] #_ilabs[iiexisting[iri]] + '** already in ilx **']
            #print('trouble?', iri)  # ilx only
        return iri, l

    #labs = {k:v[0] if v else '<--NO-LABEL-->' for k, v in Async()(deferred(inner)(id_) for id_ in chain(h_uris, (e for s in iexisting.values() for e in s)))}
    labs = {k:v[0] if v else '<--NO-LABEL-->' for k, v in Async()(deferred(inner)(id_) for id_ in h_uris)}
    ilabs = {k:l.lower() for k, l in zip(datal('ilx'), datal('label'))}
    iilabs = {v:k for k, v in ilabs.items()}
    assert len(ilabs) == len(iilabs)
    missing_map = {k:iilabs[v.lower()] for k, v in labs.items() if v and v.lower() in iilabs}  # XXX this is not valid

    missing_existing = {i:[m, *iexisting[i]] for m, i in missing_map.items() if i in iexisting}

    missing_equivs = {next(iter(iexisting[i])):i for m, i in missing_map.items() if i in iexisting}

    eid = NIFRID.externalSourceId.toPython()
    ded = owl.deprecated.toPython()
    # SP: -> swissprot vs uniprot
    mmr = []
    proto_mmr_1_to_1 = {}
    arrr = defaultdict(set)
    uniprot_iuphar = set()
    for uri, ilx_frag in {**missing_equivs, **missing_map}.items():
        uri = URIRef(uri)
        try:
            meta = sgg.getNode(uri.toPython())['nodes'][0]['meta']
        except TypeError:
            # just ignore these, they are ilx only :/
            meta = {}
        if eid in meta:
            src = meta[eid][0]
            if src.startswith('SP:'):
                src = tc.yellow(src.replace('SP:', 'http://www.uniprot.org/uniprot/'))
            #elif src.startswith('IUPHAR:'):
                #pass
            #else:
                #src = 'TODO'
        elif ded in meta and meta[ded]:
            src = tc.red('ded ')
        else:
            src = 'TODO'
        val = labs[uri] if uri in labs else _ilabs[ilx_frag] + ' **'
        if uri in eee:
            differentia = str(eee[uri])
            for v in eee[uri].values():
                for e in v:
                    arrr[e].add(uri)
                    if 'SP:' in e or 'IUPHAR:' in e:
                        uniprot_iuphar.add(uri)
        else:
            differentia = ''

        if uri in _ilx and uri in all_uris:
            ruri = SGG[hng.qname(uri)]
            ruri = tc.blue(f'{ruri:<60}')
        else:
            ruri = uri
            ruri = f'{ruri:<60}'

        v = ' '.join((f'{val:<60}',
                      src,
                      ruri,
                      ilxb[ilx_frag],
                      differentia))
        mmr.append(v)
        proto_mmr_1_to_1[uri] = v
        src = None

    arrr = {**arrr}
    arrr_not_1_to_1 = {k:v for k, v in arrr.items() if len(v) > 1}
    #arrr_n11_uris = set((u.toPython() for v in arrr_not_1_to_1.values() for u in v))
    arrr_n11_uris = set.union(*arrr_not_1_to_1.values())
    mmr_1_to_1 = {k:v for k, v in proto_mmr_1_to_1.items() if k not in arrr_n11_uris}
    no_uniprot = {k:v for k, v in proto_mmr_1_to_1.items() if k not in uniprot_iuphar}
    arrr_n11_text = '\n'.join(f'{k:<15} {sorted(_.toPython() for _ in v)}' for k, v in arrr_not_1_to_1.items())
    mmr.sort()
    mmr_text = '\n'.join(mmr)

    mmr_1_to_1_text = '\n'.join(sorted(mmr_1_to_1.values()))

    no_uniprot_text = '\n'.join(sorted(no_uniprot.values()))