def _json_identifier_expansion(obj, *args, **kwargs):
    """Expand an identifier-like object into a plain representation.

    OntTerm-ish objects are expanded via ``asDict``, idlib streams either
    return their raw identifier or their dict form, and anything else is
    passed through unchanged.  Extra positional/keyword arguments are
    accepted (recursive-apply wrapper boundary) but unused.
    """
    # Promote bare URIRef/OntId values to an instrumented term first so the
    # OntTerm branch below can handle them uniformly.
    if not isinstance(obj, oq.OntTerm):
        if isinstance(obj, rdflib.URIRef):
            obj = OntId(obj)
        if isinstance(obj, oq.OntId):
            obj = obj.asInstrumented()

    if isinstance(obj, oq.OntTerm):
        previous_class = obj.__class__
        obj.__class__ = OntTerm  # that this works is amazing/terrifying
        try:
            return obj.asDict()
        finally:
            # always restore the original class, even when asDict raises
            obj.__class__ = previous_class
    elif isinstance(obj, idlib.Stream):
        if obj._id_class is str:
            return obj.identifier
        try:
            return obj.asDict()
        except idlib.exc.RemoteError as remote_err:
            # best effort: log and fall through (implicitly returns None)
            logd.error(remote_err)
    else:
        return obj
def get(self, uri):
    """Fetch a protocols.io resource at *uri* and return the parsed JSON.

    On a non-2xx response, log the protocol error payload and record it via
    ``self.addError``; returns None in that case (callers treat a missing
    blob as "no access").
    """
    #juri = uri + '.json'
    logd.info(uri)
    log.debug('going to network for protocols')
    resp = requests.get(uri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            # a 2xx response that is not json is unexpected; log with
            # traceback and re-raise (bare raise preserves the traceback)
            log.exception(e)
            raise
        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            # error body was not the expected json shape; log both the
            # exception and the access failure
            log.exception(e)
            logd.error(f'protocol no access {uri} {self.id!r}')
def autoid_report_error(id, blob):
    """Construct an ``idlib.Auto`` for *id*, logging instead of raising.

    Returns the Auto instance, or None when *id* is malformed; the error
    message includes the owning blob's id for traceability.
    """
    try:
        return idlib.Auto(id)
    except idlib.exc.MalformedIdentifierError:
        logd.error(f'{blob["id"]} bad id: {id}')
        return None
def protocol_url_or_doi(self, value):
    """Yield rdf triples for a protocol given a Pio, Doi, or raw value.

    Dois are first expanded to their own triples, then dereferenced to a
    protocols.io id; everything converges on ``pioid`` which is used to
    fetch protocol data and emit label/step-count triples.
    """
    #_, s = self.c.protocol_url_or_doi(value)
    #yield s, rdf.type, owl.NamedIndividual
    #yield s, rdf.type, sparc.Protocol
    log.debug(value)
    if not isinstance(value, idlib.Pio):
        if isinstance(value, idlib.Doi):
            try:
                # t is a sentinel: stays None only if triples_gen fails
                # before yielding anything
                t = None
                for t in value.triples_gen:
                    yield t
            except idlib.exc.RemoteError as e:
                if t is None:
                    # we already logged this error during id dereferencing
                    return
            # partial success: reuse the subject from the last yielded triple
            ds, _, _ = t
            try:
                pioid = value.dereference(asType=idlib.Pio)
                s = self.c.l(pioid)
                yield ds, TEMP.dereferencesTo, s
                yield s, TEMP.hasDoi, ds
            except idlib.exc.MalformedIdentifierError as e:
                # doi does not dereference to a protocols.io id; nothing
                # more to emit
                log.warning(e)
                return
        else:
            pioid = idlib.Pio(value)  # FIXME :/ should be handled in Pio directly probably?
    else:
        pioid = value

    try:
        pioid_int = pioid.uri_api_int
        s = self.c.l(pioid_int)
        # FIXME needs to be a pipeline so that we can export errors
        try:
            data = pioid.data()
        except OntId.BadCurieError as e:
            loge.error(e)  # FIXME export errors ...
            data = None
    except idlib.exc.RemoteError as e:  # FIXME sandbox violation
        # could not resolve the api-internal uri; fall back to the plain id
        loge.exception(e)
        s = self.c.l(pioid)
        data = None

    yield s, rdf.type, sparc.Protocol

    if data:
        yield s, rdfs.label, rdflib.Literal(pioid.label)
        nsteps = len(data['steps'])
        yield s, TEMP.protocolHasNumberOfSteps, rdflib.Literal(nsteps)

    try:
        yield from self.integrator.triples_protcur(s)
    except OntId.BadCurieError as e:
        logd.error(e)  # FIXME export errors ...
def fetch(id):  # FIXME error proof version ...
    """Fetch metadata for *id* and stamp it with the identifier string.

    Network failures are logged and swallowed; returns None in that case.
    """
    try:
        meta = id.metadata()
        meta['id'] = id.identifier  # FIXME normalization ...
        return meta
    except requests.exceptions.HTTPError as err:
        # data-level failure (4xx/5xx): routed to the dataset log
        logd.error(err)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.SSLError) as err:
        # transport-level failure: routed to the general log
        log.error(err)
def fetch(id):  # FIXME error proof version ...
    """Fetch metadata for *id*, storing the id object itself in the blob.

    HTTP and idlib remote errors are logged and swallowed; returns None
    on failure.
    """
    try:
        record = id.metadata()
        record['id'] = id
        return record
    except (requests.exceptions.HTTPError,
            idlib.exc.RemoteError) as err:
        # data-level failure: routed to the dataset log
        logd.error(err)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.SSLError,
            idlib.exc.ResolutionError) as err:
        # transport/resolution failure: routed to the general log
        log.error(err)
def dereference_all_identifiers(obj, stage, *args, path=None, addError=None, **kwargs):
    """Attempt identifier expansion on *obj*, routing failures to errors.

    On remote/resolution failure an error record is either registered via
    *addError* or returned as ``{'errors': [...]}``; objects supporting
    ``_cooldown`` are cooled down instead.

    NOTE(review): the successful expansion (``dict_literal``) is never
    returned — on success this function returns None; confirm the caller
    expects that.
    NOTE(review): ``tuple(path)`` raises TypeError when *path* is left at
    its default of None and an exception fires — verify callers always
    pass a path.
    """
    try:
        dict_literal = _json_identifier_expansion(obj)
    except idlib.exc.RemoteError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line
        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))
        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}
    except idlib.exc.ResolutionError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line
        oops = json_export_type_converter(obj)
        msg = (f'{stage.lifters.id} could not resolve '  # FIXME lifters sigh
               f'{type(obj)}: {oops} {obj.asUri()}')
        error = dict(error=msg,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))
        if addError:
            if addError(**error):
                logd.error(msg)
        else:
            return {'errors': [error]}
    except Exception as e:
        # anything else is a bug in the stage itself, not the submission
        log.critical(f'Unhandled exception {e} in {path}')
        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='stage',
                     path=tuple(path))
        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}
def schema_wrapped_property(_self):
    """Call the wrapped *function*, validate its output against *schema*.

    On validation failure: raise when ``fail`` is set, otherwise append
    the validation errors to ``data['errors']`` and return the data.  On
    success, return the normalized form when ``self.normalize`` is set.
    """
    data = function(_self)
    ok, norm_or_error, data = schema.validate(data)
    if not ok:
        if fail:
            logd.error('schema validation has failed and fail=True')
            # removed leftover breakpoint() — debugging artifact
            raise norm_or_error

        if 'errors' not in data:
            data['errors'] = []

        data['errors'] += norm_or_error.json(pipeline_stage_name)
        # TODO make sure the step is noted even if the schema is the same
    elif self.normalize:
        return norm_or_error

    return data
def _protocol_uris_resolved(self):
    # FIXME quite slow ...
    """Yield the dereferenced form of each uri in ``self.protocol_uris``.

    Bad uris are logged and recorded via ``self.addError``; only
    successfully dereferenced uris are yielded.
    """
    for start_uri in self.protocol_uris:
        log.debug(start_uri)
        try:
            # raw strings need wrapping before they can be dereferenced
            if not hasattr(start_uri, 'dereference'):
                start_uri = idlib.StreamUri(start_uri)

            end_uri = start_uri.dereference()
            yield end_uri
            sc = end_uri.progenitor.status_code
            # NOTE(review): `sc > 400` excludes exactly 400 (Bad Request);
            # confirm whether >= 400 was intended.
            if sc > 400:
                msg = f'error accessing {end_uri} {sc}'
                if self.addError(msg, blame='submission'):
                    logd.error(msg)
        except idlib.exceptions.ResolutionError as e:
            pass  # FIXME I think we already log this error?
        except self._MissingSchema as e:
            if self.addError(e, blame='submission'):
                logd.error(e)
        except OntId.BadCurieError as e:
            if self.addError(e, blame='submission'):
                logd.error(e)
        except BaseException as e:
            #breakpoint()
            # catch-all so one bad uri does not kill the generator
            log.exception(e)
            log.critical('see exception above')
def schema_wrapped_property(_self, *args, **kwargs):
    """Call the wrapped *function*, validate its output against *schema*.

    Validation failures either raise (when ``fail`` is set) or are
    appended to ``data['errors']``; on success the normalized form is
    returned when ``self.normalize`` is set.
    """
    result = function(_self, *args, **kwargs)
    valid, norm_or_error, result = schema.validate(result)

    if valid:
        # success path: optionally hand back the normalized form
        if self.normalize:
            return norm_or_error
        return result

    if fail:
        logd.error('schema validation has failed and fail=True')
        raise norm_or_error

    try:
        if 'errors' not in result:
            result['errors'] = []
    except BaseException as err:
        # result is not dict-like; surface where it came from
        raise exc.SparCurError(
            f'Error from {_self.__class__.__name__}.'
            f'{function.__name__}') from err

    result['errors'] += norm_or_error.json(pipeline_stage_name)
    # TODO make sure the step is noted even if the schema is the same
    return result
def protocol_url_or_doi(self, value):
    """Yield rdf triples describing the protocol identified by *value*."""
    _, subject = self.c.protocol_url_or_doi(value)
    yield subject, rdf.type, owl.NamedIndividual
    yield subject, rdf.type, sparc.Protocol

    # FIXME needs to be a pipeline so that we can export errors
    protocol_data = ProtocolData(self.integrator.id)
    try:
        # FIXME a bit opaque, needs to move to a pipeline, clean up init etc.
        blob = protocol_data(value)
    except OntId.BadCurieError as e:
        logd.error(e)  # FIXME export errors ...
        blob = None

    if blob:
        yield subject, rdfs.label, rdflib.Literal(blob['protocol']['title'])
        step_count = len(blob['protocol']['steps'])
        yield subject, TEMP.protocolHasNumberOfSteps, rdflib.Literal(step_count)

    try:
        yield from self.integrator.triples_protcur(subject)
    except OntId.BadCurieError as e:
        logd.error(e)  # FIXME export errors ...
def _get_protocol_json(self, uri):
    """Resolve *uri* to a protocols.io slug and fetch its v3 api JSON.

    Non-protocols.io uris are recorded as errors and return None; api
    failures are logged (and added to errors) and also return None.
    """
    #juri = uri + '.json'
    logd.info(uri)
    pi = get_right_id(uri)
    if 'protocols.io' in pi:
        pioid = pi.slug  # FIXME normalize before we ever get here ...
        log.info(pioid)
    else:
        msg = f'protocol uri is not from protocols.io {pi} {self.id}'
        logd.error(msg)
        self.addError(msg)
        return

    #uri_path = uri.rsplit('/', 1)[-1]
    apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
    #'https://www.protocols.io/api/v3/groups/sparc/protocols'
    #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
    #print(apiuri, header)
    log.debug('going to network for protocols')
    resp = requests.get(apiuri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            # a 2xx response that is not json is unexpected; log with
            # traceback and re-raise (bare raise preserves the traceback)
            log.exception(e)
            raise
        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            # error body was not the expected json shape
            log.exception(e)
            logd.error(f'protocol no access {uri} {self.id!r}')
def validate_path_json_metadata(cls, path_meta_blob):
    """Classify each path record in *path_meta_blob* by file-type status.

    Sets ``path_meta['status']`` to one of banned/known/unknown based on
    banned basenames and the mimetype status lookup, accumulating any
    issues into the blob via HasErrors.

    NOTE(review): ``suffixes`` from ``_file_type_status_lookup`` is never
    used here — confirm whether suffix-based classification was intended.
    """
    from sparcur.core import HasErrors  # FIXME
    he = HasErrors(pipeline_stage=cls.__name__ + '.validate_path_json_metadata')
    mimetypes, suffixes = cls._file_type_status_lookup()  # SIGH this overhead is 2 function calls and a branch
    for i, path_meta in enumerate(path_meta_blob['data']):
        # 1. hard ban by basename short-circuits everything else
        if path_meta['basename'] in cls._banned_basenames:
            msg = f'illegal file detect {path_meta["basename"]}'
            dsrp = path_meta['dataset_relative_path']
            if he.addError(msg, path=dsrp, json_path=('data', i)):
                logd.error(msg)

            status = 'banned'
            path_meta['status'] = status
            continue

        # 2. pick a mimetype, preferring libmagic's detection and warning
        #    when it disagrees with the extension-derived mimetype
        if 'magic_mimetype' in path_meta and 'mimetype' in path_meta:
            # FIXME NOT clear whether magic_mimetype should be used by itself
            # usually magic and file extension together work, magic by itself
            # can give some completely bonkers results
            source = 'magic_mimetype'
            mimetype = path_meta['magic_mimetype']
            muggle_mimetype = path_meta['mimetype']
            if mimetype != muggle_mimetype:
                msg = f'mime types do not match {mimetype} != {muggle_mimetype}'
                dsrp = path_meta['dataset_relative_path']
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    log.error(msg)
        elif 'magic_mimetype' in path_meta:
            source = 'magic_mimetype'
            mimetype = path_meta['magic_mimetype']
        elif 'mimetype' in path_meta:
            source = 'mimetype'
            mimetype = path_meta['mimetype']
        else:
            mimetype = None

        # 3. map the mimetype to a status via the lookup table
        if mimetype is not None:
            try:
                status = mimetypes[mimetype]
                if status == 'banned':
                    msg = f'banned mimetype detected {mimetype}'
                    dsrp = path_meta['dataset_relative_path']
                    if he.addError(msg, path=dsrp, json_path=('data', i, source)):
                        logd.error(msg)
            except KeyError as e:
                # not in the table: treat as known but remember it once
                status = 'known'
                if mimetype not in cls._unclassified_mimes:
                    cls._unclassified_mimes.add(mimetype)
                    log.info(f'unclassified mimetype {mimetype}')
        else:
            # no mimetype at all: unknown; message depends on whether the
            # relative path is a bare string or a path object with suffixes
            status = 'unknown'
            dsrp = path_meta['dataset_relative_path']
            if isinstance(dsrp, str):
                if not dsrp:
                    msg = f'FIXME top level folder needs a mimetype!'
                else:
                    msg = f'unknown mimetype {path_meta["basename"]}'
            else:
                msg = f'unknown mimetype {"".join(dsrp.suffixes)}'
                cls._unknown_suffixes.add(tuple(dsrp.suffixes))

            if he.addError(msg, path=dsrp, json_path=('data', i)):
                logd.warning(msg)

        path_meta['status'] = status

    if he._errors_set:
        he.embedErrors(path_meta_blob)