Example #1
0
 def autoid_report_error(id, blob):
     """Parse ``id`` with ``idlib.Auto``, logging instead of raising.

     Returns whatever ``idlib.Auto(id)`` produces. When that call raises
     ``idlib.exc.MalformedIdentifierError`` an error naming ``blob["id"]``
     and the offending ``id`` is written to ``logd`` and ``None`` is
     returned instead.
     """
     try:
         parsed = idlib.Auto(id)
     except idlib.exc.MalformedIdentifierError:
         logd.error(f'{blob["id"]} bad id: {id}')
         return None

     return parsed
Example #2
0
    def export_identifier_metadata(self, dump_path, dataset_blobs):
        """Build (or reuse) the identifier metadata blob and dump it as json.

        When ``self.latest`` is truthy and ``self.latest_id_met_path`` exists,
        ``self.latest_id_met`` is reused unchanged. Otherwise the identifiers
        referenced from each blob in ``dataset_blobs`` are collected,
        metadata is fetched for those that are ``idlib.Doi`` instances, and a
        new blob is assembled; every doi whose fetch produced ``None`` is
        listed under ``'errors'``.

        In both cases the blob is written to
        ``dump_path / 'identifier-metadata.json'`` and then returned.
        """
        reuse_latest = self.latest and self.latest_id_met_path.exists()
        if reuse_latest:
            blob_id_met = self.latest_id_met
        else:

            def fetch(doi):  # FIXME error proof version ...
                """Return the metadata for ``doi``, or None after logging a
                request failure."""
                try:
                    record = doi.metadata()
                    record['id'] = doi.identifier  # FIXME normalization ...
                except requests.exceptions.HTTPError as e:
                    logd.error(e)
                    return None
                except (requests.exceptions.ConnectionError,
                        requests.exceptions.SSLError) as e:
                    log.error(e)
                    return None

                return record

            # retrieve doi metadata and materialize it in the dataset
            collected = []
            for blob in dataset_blobs:
                candidates = chain(
                    adops.get(blob, ['meta', 'protocol_url_or_doi'],
                              on_failure=[]),
                    adops.get(blob, ['meta', 'originating_article_doi'],
                              on_failure=[]),
                    # TODO data["links"]?
                    [blob['meta']['doi']])
                for raw in candidates:
                    if raw is None:
                        continue

                    if isinstance(raw, idlib.Stream):
                        collected.append(raw)
                    else:
                        collected.append(idlib.Auto(raw))

            _dois = set(collected)  # drop duplicate identifiers

            dois = [d for d in _dois if isinstance(d, idlib.Doi)]
            fetched = Async(rate=10)(deferred(fetch)(d) for d in dois)

            # TODO more granular reporting e.g. 404
            bads = []
            for doi, record in zip(dois, fetched):
                if record is None:
                    bads.append({'id': doi, 'reason': 'no metadata'})

            metadatas = [record for record in fetched if record is not None]

            meta = {'count': len(metadatas)}
            prov = {
                'timestamp_export_start': self.timestamp,
                'export_system_identifier': Path.sysid,
                'export_hostname': gethostname(),
                'export_project_path': self.export_source_path.cache.anchor,
            }
            blob_id_met = {
                'id': 'identifier-metadata',  # TODO is this ok ?
                'identifier_metadata': metadatas,
                'errors': bads,
                'meta': meta,
                'prov': prov,
            }

        out_path = dump_path / 'identifier-metadata.json'
        with open(out_path, 'wt') as fd:
            json.dump(blob_id_met, fd, sort_keys=True, indent=2, cls=JEncode)

        return blob_id_met
Example #3
0
    def triples(self):
        """Yield rdf triples for the records in
        ``self.data['identifier_metadata']``.

        For a record whose ``'source'`` is ``'Crossref'`` the turtle returned
        by ``doi.ttl()`` is parsed into a fresh ``OntGraph``. If that graph
        contains a triple using the prism ``doi`` predicate, an
        ``owl.sameAs`` link from our subject to the first such subject is
        yielded followed by every triple of the parsed graph. After that the
        locally derived type/publisher/title/date triples are yielded,
        skipping ``None`` objects and wrapping anything that is not an
        ``rdflib.URIRef`` in ``rdflib.Literal``.

        Generation stops, after logging an error, at the first record that
        has no ``'source'`` or whose source is not ``'Crossref'``; it stops
        without logging when ``doi.ttl()`` returns ``None``.

        Raises ``AttributeError`` when an identifier has no ``asUri``.
        """
        crossref_doi_pred = rdflib.term.URIRef('http://prismstandard.org/namespaces/basic/2.1/doi')
        for blob in self.data['identifier_metadata']:
            id = blob['id']
            if not isinstance(id, idlib.Stream):
                id = idlib.Auto(id)

            if not hasattr(id, 'asUri'):
                # A stray breakpoint() used to sit here; it would drop an
                # unattended run into pdb. Record the offending identifier
                # instead, the asUri call below still raises AttributeError.
                log.error(f'{id!r} has no asUri, cannot make a subject for it')

            s = id.asUri(rdflib.URIRef)
            if 'source' in blob:
                source = blob['source']  # FIXME we need to wrap this in our normalized representation
                if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as a an alternate ttl
                    pos = (
                        (rdf.type, owl.NamedIndividual),
                        (rdf.type, TEMP[blob['type']]),
                        (dc.publisher, blob['publisher']),
                        #(dc.type, blob['type']),  # FIXME semantify
                        (dc.title, blob['title']),
                        (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                    )
                    g = OntGraph()
                    doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                    data = doi.ttl()
                    if data is None:  # blackfynn has some bad settings on their doi records ...
                        # NOTE(review): this ends the whole generator, so the
                        # records after this one are never emitted; confirm
                        # that is intended rather than `continue`.
                        return

                    try:
                        g.parse(data=data, format='ttl')  # FIXME network bad
                    except Exception as e:
                        # Exception rather than BaseException so that
                        # KeyboardInterrupt/SystemExit are not swallowed.
                        loge.exception(e)

                    # subjects of their record, i.e. those carrying the doi predicate
                    _tr = [gs for gs, gp, _ in g if gp == crossref_doi_pred]
                    if _tr:
                        _their_record_s = _tr[0]
                        yield s, owl.sameAs, _their_record_s
                        yield from g
                    else:
                        g.debug()
                        log.critical('No crossref doi section in graph!')
                else:
                    msg = f'dont know what to do with {source}'
                    log.error(msg)
                    #raise NotImplementedError(msg)
                    return
            else:
                msg = f'dont know what to do with {blob} for {id.identifier}'
                log.error(msg)
                #raise NotImplementedError(msg)
                return

            for p, oraw in pos:
                if oraw is not None:
                    o = rdflib.Literal(oraw) if not isinstance(oraw, rdflib.URIRef) else oraw
                    yield s, p, o
Example #4
0
    def triples(self):
        """Yield rdf triples for the records in
        ``self.data['identifier_metadata']``.

        For a record whose ``'source'`` is ``'Crossref'`` the turtle returned
        by ``doi.ttl()`` is parsed into a fresh ``OntGraph``. If the parsed
        graph contains a triple using the prism ``doi`` predicate, an
        ``owl.sameAs`` link from our subject to the first such subject is
        yielded followed by every triple of the parsed graph; if it does not,
        a critical message is logged and only the local triples are emitted.
        The local type/publisher/title/date triples skip ``None`` objects and
        wrap anything that is not an ``rdflib.URIRef`` in ``rdflib.Literal``.

        Generation stops, after logging an error, at the first record that
        has no ``'source'`` or whose source is not ``'Crossref'``.
        """
        # Built once here instead of once per triple of every parsed graph.
        crossref_doi_pred = rdflib.term.URIRef(
            'http://prismstandard.org/namespaces/basic/2.1/doi')
        for blob in self.data['identifier_metadata']:
            id = blob['id']
            if not isinstance(id, idlib.Stream):
                id = idlib.Auto(id)

            s = id.asType(rdflib.URIRef)
            if 'source' in blob:
                source = blob[
                    'source']  # FIXME we need to wrap this in our normalized representation
                if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as a an alternate ttl
                    pos = (
                        (rdf.type, owl.NamedIndividual),
                        (rdf.type, TEMP[blob['type']]),
                        (dc.publisher, blob['publisher']),
                        #(dc.type, blob['type']),  # FIXME semantify
                        (dc.title, blob['title']),
                        (dc.date,
                         self.published_online(blob)),  # FIXME .... dangerzone
                    )
                    g = OntGraph()
                    doi = idlib.Doi(id) if not isinstance(
                        id, idlib.Doi
                    ) else id  # FIXME idlib streams need to recognize their own type in __new__
                    g.parse(data=doi.ttl(), format='ttl')  # FIXME network bad
                    their_subjects = [
                        gs for gs, gp, _ in g if gp == crossref_doi_pred
                    ]
                    if their_subjects:
                        yield s, owl.sameAs, their_subjects[0]
                        yield from g
                    else:
                        # Indexing [0] unconditionally raised IndexError when
                        # the record had no doi triple; report it and still
                        # emit the locally derived triples below.
                        log.critical('No crossref doi section in graph!')
                else:
                    msg = f'dont know what to do with {source}'
                    log.error(msg)
                    #raise NotImplementedError(msg)
                    return
            else:
                msg = f'dont know what to do with {blob} for {id.identifier}'
                log.error(msg)
                #raise NotImplementedError(msg)
                return

            for p, oraw in pos:
                if oraw is not None:
                    o = rdflib.Literal(oraw) if not isinstance(
                        oraw, rdflib.URIRef) else oraw
                    yield s, p, o