def dataset(self):
    if self.is_dataset():
        return self
    elif self.parent and self.parent != self:  # Path('.') issue
        log.debug(self.parent)
        return self.parent.dataset
def _protocol_uris_resolved(self):
    # FIXME quite slow ...
    for start_uri in self.protocol_uris:
        log.debug(start_uri)
        try:
            if not hasattr(start_uri, 'dereference'):
                start_uri = idlib.StreamUri(start_uri)

            end_uri = start_uri.dereference()
            yield end_uri
            sc = end_uri.progenitor.status_code
            if sc > 400:
                msg = f'error accessing {end_uri} {sc}'
                if self.addError(msg, blame='submission'):
                    logd.error(msg)

        except idlib.exceptions.ResolutionError as e:
            pass  # FIXME I think we already log this error?
        except self._MissingSchema as e:
            if self.addError(e, blame='submission'):
                logd.error(e)
        except OntId.BadCurieError as e:
            if self.addError(e, blame='submission'):
                logd.error(e)
        except BaseException as e:
            #breakpoint()
            log.exception(e)
            log.critical('see exception above')
def update_cache(self):
    log.debug(f'maybe updating cache for {self.name}')
    file_is_different = self.cache._meta_updater(self.meta)  # update the cache first
    # then move to the new name if relevant
    # prevents moving partial metadata onto existing files
    parent_changed = (hasattr(self._bfobject, 'parent')
                      and self._bfobject.parent != self.cache.parent.id)
    if self.cache.name != self.name or parent_changed:  # this is locally correct
        # the issue is that move is now smarter
        # and will detect if a parent path has changed
        try:
            self.cache.move(remote=self)
        except exc.WhyDidntThisGetMovedBeforeError as e:
            # AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
            # deal with the sadness that is non-unique filenames
            # I am 99.999999999999999% certain that users do not
            # expect this behavior ...
            log.error(e)
            if self.bfobject.package.name != self.bfobject.name:
                argh = self.bfobject.name
                self.bfobject.name = self.bfobject.package.name
                try:
                    log.critical(
                        f'Non unique filename :( '
                        f'{self.cache.name} -> {argh} -> {self.bfobject.name}')
                    self.cache.move(remote=self)
                finally:
                    self.bfobject.name = argh
            else:
                raise e

    return file_is_different
def get(self, uri):
    #juri = uri + '.json'
    logd.info(uri)
    log.debug('going to network for protocols')
    resp = requests.get(uri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            log.exception(e)
            breakpoint()
            raise e

        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            log.exception(e)

        logd.error(f'protocol no access {uri} {self.id!r}')
def triples_gen(self, subject):
    if not (isinstance(subject, rdflib.URIRef) or
            isinstance(subject, rdflib.BNode)):
        if isinstance(subject, idlib.Stream):
            subject = subject.asType(rdflib.URIRef)
        else:
            subject = rdflib.URIRef(subject)

    #maybe_not_normalized = self.message_passing_key in self._source  # TODO maybe not here?
    for field, value in self._source.items():
        #normalized = not (maybe_not_normalized and field in self._source)  # TODO
        #log.debug(f'{field}: {value}')
        if type(field) is object:
            continue  # the magic helper key for Pipeline

        convert = getattr(self, field, None)
        extra = getattr(self.extra, field, None)
        if convert is not None:
            if isinstance(value, tuple) or isinstance(value, list):
                values = value
            else:
                values = value,

            for v in values:
                log.debug(f'{field} {v} {convert}')
                try:
                    p, o = convert(v)
                except exc.NoTripleError as e:
                    continue

                log.debug(o)
                if isinstance(o, Expr) or isinstance(o, Quantity):
                    s = rdflib.BNode()
                    yield subject, p, s
                    qt = sparc.Measurement
                    if isinstance(o, Range):
                        yield from o.asRdf(s, quantity_rdftype=qt)
                    elif isinstance(o, Quantity):
                        yield from o.asRdf(s, rdftype=qt)
                    else:
                        log.warning(f'unhandled Expr type {o}')
                        yield from o.asRdf(s)
                else:
                    yield subject, p, o

                if extra is not None:
                    yield from extra(v)

        elif field in self.known_skipped:
            pass

        else:
            msg = f'Unhandled {self.__class__.__name__} field: {field}'
            if msg not in self._already_warned:
                self._already_warned.add(msg)
                log.warning(msg)
                self.addError(msg,
                              pipeline_stage=self.__class__.__name__ + '.export-error')
def _protocol_uris_resolved(self):
    # FIXME quite slow ...
    for start_uri in self.protocol_uris:
        log.debug(start_uri)
        for end_uri in resolution_chain(start_uri):
            pass
        else:
            yield end_uri
def _derive(data, derives, source_key_optional=True, allow_empty=False):
    # OLD
    """ derives is a list with the following structure
        [[[source-path, ...], derive-function, [target-path, ...]], ...]
    """
    # TODO this is an implementation of copy that has semantics for handling lists
    for source_path, function, target_paths in derives:
        source_prefixes = source_path[:-1]
        source_key = source_path[-1]
        source = data
        failed = False
        for i, node_key in enumerate(source_prefixes):
            log.debug(lj(source))
            if node_key in source:
                source = source[node_key]
            else:
                msg = f'did not find {node_key} in {source.keys()}'
                if not i:
                    log.error(msg)
                    failed = True
                    break

                raise exc.NoSourcePathError(msg)

            if isinstance(source, list) or isinstance(source, tuple):
                new_source_path = source_prefixes[i + 1:] + [source_key]
                new_target_paths = [tp[i + 1:] for tp in target_paths]
                new_derives = [(new_source_path, function, new_target_paths)]
                for sub_source in source:
                    _DictTransformer.derive(sub_source, new_derives,
                                            source_key_optional=source_key_optional)

                return  # no more to do here

        if failed:
            continue  # sometimes things are missing we continue to others

        if source_key not in source:
            msg = f'did not find {source_key} in {source.keys()}'
            if source_key_optional:
                return logd.info(msg)
            else:
                raise exc.NoSourcePathError(msg)

        source_value = source[source_key]

        new_values = function(source_value)
        if len(new_values) != len(target_paths):
            log.debug(f'{source_path} {target_paths}')
            raise TypeError(f'wrong number of values returned for {function}\n'
                            f'was {len(new_values)} expect {len(target_paths)}')

        #temp = b'__temporary'
        #data[temp] = {}  # bytes ensure no collisions
        for target_path, value in zip(target_paths, new_values):
            if (not allow_empty and
                (value is None or
                 hasattr(value, '__iter__') and not len(value))):
                raise ValueError(f'value to add to {target_path} may not be empty!')

            adops.add(data, target_path, value, fail_on_exists=True)
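# A minimal sketch of one entry in the `derives` structure consumed above; the
# paths and the lambda are hypothetical, not taken from the real pipelines,
# and are only meant to show the source-path / function / target-paths shape.
example_derives = [
    [['meta', 'award_number'],             # source-path: prefix keys + final source key
     lambda award: (award.strip(),),       # derive-function: returns one value per target path
     [['meta', 'award_number_cleaned']]],  # target-paths: where each returned value is added
]
# Given data = {'meta': {'award_number': ' OT2OD0 '}}, running
# _derive(data, example_derives) would add
# data['meta']['award_number_cleaned'] = 'OT2OD0' via adops.add.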
def protocol_url_or_doi(self, value):
    #_, s = self.c.protocol_url_or_doi(value)
    #yield s, rdf.type, owl.NamedIndividual
    #yield s, rdf.type, sparc.Protocol
    log.debug(value)
    if not isinstance(value, idlib.Pio):
        if isinstance(value, idlib.Doi):
            try:
                t = None
                for t in value.triples_gen:
                    yield t

            except idlib.exc.RemoteError as e:
                if t is None:
                    # we already logged this error during id dereferencing
                    return

            ds, _, _ = t
            try:
                pioid = value.dereference(asType=idlib.Pio)
                s = self.c.l(pioid)
                yield ds, TEMP.dereferencesTo, s
                yield s, TEMP.hasDoi, ds
            except idlib.exc.MalformedIdentifierError as e:
                log.warning(e)
                return
        else:
            try:
                pioid = idlib.Pio(value)  # FIXME :/ should be handled in Pio directly probably?
            except idlib.exc.MalformedIdentifierError as e:
                logd.warning(e)
                return
    else:
        pioid = value

    try:
        pioid_int = pioid.uri_api_int
        s = self.c.l(pioid_int)
        yield from pioid_int.triples_gen

        # FIXME needs to be a pipeline so that we can export errors
        try:
            data = pioid.data()
        except (OntId.BadCurieError, idlib.exc.MalformedIdentifierError) as e:
            loge.error(e)  # FIXME export errors ...
            data = None

    except idlib.exc.RemoteError as e:
        # FIXME sandbox violation
        loge.exception(e)
        s = self.c.l(pioid)
        data = None

    yield s, rdf.type, sparc.Protocol

    if data:
        yield s, rdfs.label, rdflib.Literal(pioid.label)
        nsteps = len(data['steps'])
        yield s, TEMP.protocolHasNumberOfSteps, rdflib.Literal(nsteps)
def get_file_by_url(cls, url):
    """ NOTE THAT THE FIRST YIELD IS HEADERS """
    resp = requests.get(url, stream=True)
    headers = resp.headers
    yield headers
    log.debug(f'reading from {url}')
    for chunk in resp.iter_content(chunk_size=4096):  # FIXME align chunksizes between local and remote
        if chunk:
            yield chunk
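# Hedged usage sketch for the generator above: the first yield is the response
# headers, every later yield is a bytes chunk. `save_stream` and its arguments
# are hypothetical, written only to show the headers-then-chunks contract.
def save_stream(file_gen, dest_path):
    headers = next(file_gen)      # first yield: HTTP response headers
    with open(dest_path, 'wb') as f:
        for chunk in file_gen:    # remaining yields: raw file content
            f.write(chunk)

    return headers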
def pull(self,
         *args,
         paths=None,
         time_now=None,
         debug=False,
         n_jobs=12,
         cache_anchor=None,
         log_name=None,
         log_level='INFO',
         # pass Parallel in at run time if needed
         Parallel=None,
         delayed=None,
         _in_parallel=False,
         exclude_uploaded=True,):
    # TODO usage errors

    if time_now is None:
        time_now = GetTimeNow()
        log.debug('No time provided to pull so using '
                  f'{time_now.START_TIMESTAMP}')

    if _in_parallel:
        _log = logging.getLogger(log_name)
        _log.setLevel(log_level)
        rc = self._remote_class
        if not hasattr(rc, '_cache_anchor'):
            rc.anchorTo(cache_anchor)
    else:
        _log = log

    cache = self.cache
    if cache.is_organization():
        if debug or Parallel is None or n_jobs <= 1:
            for child in self.children:
                if paths is None or child in paths:
                    child.pull()
        else:
            Parallel(n_jobs=n_jobs)(
                delayed(child.pull)(_in_parallel=True,
                                    time_now=time_now,
                                    cache_anchor=cache.anchor,
                                    log_name=_log.name,
                                    log_level=log_level,
                                    exclude_uploaded=exclude_uploaded,)
                for child in self.children
                if paths is None or child in paths)

    elif cache.is_dataset():
        self._pull_dataset(time_now, exclude_uploaded)  # XXX actual pull happens in here

    else:
        raise NotImplementedError(self)
def update_cache(self):
    log.debug(f'updating cache for {self.name}')
    if self.cache.name != self.name:  # this is locally correct
        # the issue is that move is now smarter
        # and will detect if a parent path has changed
        self.cache.move(remote=self)

    file_is_different = self.cache._meta_updater(self.meta)
    return file_is_different
def fromId(cls, identifier, cache_class, local_class):
    # FIXME decouple class construction for identifier binding
    # _api is not required at all and can be bound explicitly later
    api = cls._api_class(identifier)
    self = RemoteFactory.__new__(cls, local_class, cache_class, _api=api)
    self._errors = []
    self.root = self._api.root
    log.debug('When initializing a remote using fromId be sure to set the cache anchor '
              'before doing anything else, otherwise you will have a baaad time')
    return self
def allOf(obj):
    for o in obj['allOf']:
        if '$ref' in o:
            ref = o['$ref']
            if ref in types:
                yield types[ref]
            else:
                jpath = ref_to_list(ref)
                no = adops.get(schema, jpath)
                yield top(jpath[-1], no)
        else:
            log.debug(f'{obj}')
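# Hedged illustration of the shape allOf expects; the $ref target and the
# schema fragment are made up. Refs already present in `types` are reused,
# any other ref is resolved against `schema` via ref_to_list/adops.get, and
# members without a $ref are only logged at debug level.
example_allof_obj = {'allOf': [
    {'$ref': '#/definitions/dataset_id'},  # resolved to a type
    {'description': 'extra constraint'},   # no $ref, only logged
]}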
def etag(self):
    """ NOTE returns checksum, count since it is an etag"""
    # FIXME rename to etag in the event that we get proper checksumming ??
    if hasattr(self.bfobject, 'checksum'):
        checksum = self.bfobject.checksum
        if checksum and '-' in checksum:
            log.debug(checksum)
            if isinstance(checksum, str):
                checksum, strcount = checksum.rsplit('-', 1)
                count = int(strcount)
                #if checksum[-2] == '-':
                    # these are 34 long, i assume the -1 is a check byte?
                    #return bytes.fromhex(checksum[:-2])
                return bytes.fromhex(checksum), count
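# Hedged example of the multipart-style checksum this parses; the digest below
# is made up. A value such as 'd41d8cd98f00b204e9800998ecf8427e-3' splits on
# the final '-' into the hex digest and a part count, so etag would return
# (bytes.fromhex('d41d8cd98f00b204e9800998ecf8427e'), 3).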
def subpipeline(data, runtime_context, subpipelines, update=True,
                source_key_optional=True, lifters=None):
    """ [[[[get-path, add-path], ...], pipeline-class, target-path], ...]

        NOTE: this function is a generator, you have to express it! """

    class DataWrapper:
        def __init__(self, data):
            self.data = data

    prepared = []
    for get_adds, pipeline_class, target_path in subpipelines:
        selected_data = {}
        ok = True
        for get_path, add_path in get_adds:
            try:
                value = adops.get(data, get_path)
                if add_path is not None:
                    adops.add(selected_data, add_path, value)
                else:
                    selected_data = value
            except exc.NoSourcePathError as e:
                if source_key_optional:
                    yield get_path, e, pipeline_class
                    ok = False
                    break  # breaks the inner loop
                else:
                    raise e

        if not ok:
            continue

        log.debug(lj(selected_data))
        prepared.append((target_path, pipeline_class, DataWrapper(selected_data),
                         lifters, runtime_context))

    function = adops.update if update else adops.add
    for target_path, pc, *args in prepared:
        p = pc(*args)
        if target_path is not None:
            function(data, target_path, p.data)
        else:
            p.data  # trigger the pipeline since it is stateful

        yield p
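# A minimal sketch of one `subpipelines` entry for the function above; the
# paths and FakeSubPipeline are hypothetical and only illustrate the
# [[get-path, add-path], ...], pipeline-class, target-path shape.
class FakeSubPipeline:
    def __init__(self, data_wrapper, lifters, runtime_context):
        self.data = data_wrapper.data  # a real pipeline would transform this

example_subpipelines = [
    [[[['inputs', 'manifest'], ['manifest']]],  # (get-path, add-path) pairs
     FakeSubPipeline,                           # pipeline-class
     ['meta', 'manifest_summary']],             # target-path for the pipeline output
]
# Because subpipeline is a generator it must be expressed, e.g.
#     list(subpipeline(data, None, example_subpipelines))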
def _rchildren(self, create_cache=True):
    if isinstance(self.bfobject, File):
        return
    elif isinstance(self.bfobject, DataPackage):
        return  # should we return files inside packages? are they 1:1?
    elif any(isinstance(self.bfobject, t) for t in (Organization, Collection)):
        for child in self.children:
            yield child
            yield from child.rchildren
    elif isinstance(self.bfobject, Dataset):
        for bfobject in self.bfobject.packages:
            child = self.__class__(bfobject)
            if child.is_dir() or child.is_file():
                if child.is_file():
                    cid = child.id
                    existing = [c for c in self.cache.local.children
                                if (c.is_file() and c.cache or c.is_broken_symlink())
                                and c.cache.id == cid]
                    if existing:
                        unmatched = [e for e in existing if child.name != e.name]
                        if unmatched:
                            log.debug(f'skipping {child.name} because a file with that '
                                      f'id already exists {unmatched}')
                            continue

                if create_cache:
                    # FIXME I don't think existing detection is working
                    # correctly here so this gets triggered incorrectly?
                    self.cache / child  # construction will cause registration without needing to assign
                    assert child.cache is not None

                yield child
            else:
                # probably a package that has files
                log.debug(f'skipping {child} because it is neither a directory nor a file')
    else:
        raise exc.UnhandledTypeError  # TODO
def check_for_updates(project_id):
    datasets = datasets_remote_from_project_id(project_id)
    #datasets = sorted(datasets, key=lambda r:r.id)[:3]
    for dataset in datasets:
        dataset_id = dataset.id
        sid = 'state-' + dataset_id
        uid = 'updated-' + dataset_id
        fid = 'failed-' + dataset_id
        qid = 'queued-' + dataset_id
        _updated = conn.get(uid)
        updated = _updated.decode() if _updated is not None else _updated
        _qupdated = conn.get(qid)
        qupdated = _qupdated.decode() if _qupdated is not None else _qupdated
        _failed = conn.get(fid)
        failed = _failed.decode() if _failed is not None else _failed
        _state = conn.get(sid)
        state = int(_state) if _state is not None else _state
        rq = state == _qed_run
        running = state == _run or rq
        queued = state == _qed or rq
        #log.debug(f'STATUS :id {dataset_id} :u {updated} :f {failed} :q {queued} :r {running}')

        # All the logic for whether to run a particular dataset
        # timestamp_updated or timestamp_updated_contents whichever is greater
        # NOTE we populate updated values into redis at startup from
        # the latest export of each individual dataset
        # TODO also need to check sparcur code changes to see if we need to rerun
        if (not (updated or failed) or
            failed and dataset.updated > failed or
            not failed and updated and dataset.updated > updated):
            log.debug((f'MAYBE ENQUEUE :id {dataset_id} du: '
                       f'{dataset.updated} u: {updated} f: {failed}'))
            if queued:
                pass
            elif running and updated and qupdated and updated > qupdated:
                conn.incr(sid)
            else:
                conn.incr(sid)
                conn.set(qid, dataset.updated)
                export_single_dataset.delay(dataset_id, dataset.updated)
def _get_protocol_json(self, uri):
    #juri = uri + '.json'
    logd.info(uri)
    pi = get_right_id(uri)
    if 'protocols.io' in pi:
        pioid = pi.slug  # FIXME normalize before we ever get here ...
        log.info(pioid)
    else:
        msg = f'protocol uri is not from protocols.io {pi} {self.id}'
        logd.error(msg)
        self.addError(msg)
        return

    #uri_path = uri.rsplit('/', 1)[-1]
    apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
    #'https://www.protocols.io/api/v3/groups/sparc/protocols'
    #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
    #print(apiuri, header)
    log.debug('going to network for protocols')
    resp = requests.get(apiuri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            log.exception(e)
            breakpoint()
            raise e

        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            log.exception(e)

        logd.error(f'protocol no access {uri} {self.id!r}')
def _render(self, e, stage, blame, path):
    o = {'pipeline_stage': stage,
         'blame': blame,}  # FIXME

    if path is not None:
        o['file_path'] = path

    if isinstance(e, str):
        o['message'] = e
        o['type'] = None  # FIXME probably want our own?
    elif isinstance(e, BaseException):
        o['message'] = str(e)
        o['type'] = str(type(e))
    else:
        raise TypeError(repr(e))

    log.debug(o)
    return o
def data(self): """ get the 'cached' data which isn't really cached at the moment once we implement an index for local files then we can hit that first from here """ # we don't keep two copies of the local data # unless we are doing a git-like thing if self.is_dir(): raise TypeError('can\'t retrieve data for a directory') meta = self.meta if meta.file_id is None: raise NotImplementedError('can\'t fetch data without a file id') #cands = list(self.local_object_cache_dir.glob(self.cache_key)) # FIXME this does not play well with old_id ... # can probably get away with just globing for the old_id in # most cases # TODO where to store the chain of prior versions? i.e. do # we just keep the xattrs in the object cache? how about file moves? # sigh git ... if self.local_object_cache_path.exists(): gen = chain((f'from local cache {self.local_object_cache_path}', ), self.local_object_cache_path.data) else: gen = self._remote_class.get_file_by_id(meta.id, meta.file_id) try: self.data_headers = next(gen) except exc.NoRemoteFileWithThatIdError as e: log.error(f'{self} {e}') raise FileNotFoundError( f'{self}' ) from e # have to raise so that we don't overwrite the file log.debug(self.data_headers) if self.local_object_cache_path.exists(): yield from gen else: yield from self.local_object_cache_path._data_setter(gen) self.local_object_cache_path.cache_init( self.meta) # FIXME self.meta be stale here?!
def update_from_ir(self, ir):
    oi = OntTerm.query._instrumented
    if oi is not OntTerm:
        OntTerm.query._instrumented = OntTerm

    def cformat(cell):
        if isinstance(cell, OntTerm):
            cell = cell.asCell()

        return cell

    try:
        dataset_blobs = ir['datasets']
        self._wat = self.values[8]
        for blob in dataset_blobs:
            meta = blob['meta']
            #species = adops.get(blob, ['subjects', int, 'species'], on_failure='')  # TODO not implemented
            if 'subjects' in blob:
                species = '\n'.join(sorted(set(
                    [cformat(s['species'])
                     for s in blob['subjects'] if 'species' in s])))
            else:
                species = ''

            self._update_dataset_metadata(
                id=blob['id'],
                name=adops.get(blob, ['meta', 'folder_name'], on_failure=''),
                award=adops.get(blob, ['meta', 'award_number'], on_failure=''),
                species=species,
            )
    finally:
        # FIXME this is so dumb :/
        OntTerm.query._instrumented = oi

    log.debug(self.uncommitted())
    self.commit()
def data(self): """ get the 'cached' data which isn't really cached at the moment once we implement an index for local files then we can hit that first from here """ # we don't keep two copies of the local data # unless we are doing a git-like thing if self.is_dir(): raise TypeError('can\'t retrieve data for a directory') meta = self.meta if meta.file_id is None: raise NotImplementedError('can\'t fetch data without a file id') #cands = list(self.local_object_cache_dir.glob(self.cache_key)) # FIXME this does not play well with old_id ... # can probably get away with just globing for the old_id in # most cases # TODO where to store the chain of prior versions? i.e. do # we just keep the xattrs in the object cache? how about file moves? # sigh git ... if self.local_object_cache_path.exists(): locsize = self.local_object_cache_path.size if locsize != meta.size: raise NotImplementedError( 'TODO yield from local then fetch the rest starting at offset' ) gen = chain((f'from local cache {self.local_object_cache_path}', ), self.local_object_cache_path.data) else: if not hasattr(self._remote_class, '_api'): # NOTE we do not want to dereference self.remote # in this situation because we just want the file # not the FS metadata, so we have to ensure that _api # is bound self._remote_class.anchorToCache(self.anchor) gen = self._remote_class.get_file_by_id(meta.id, meta.file_id) try: self.data_headers = next(gen) except exc.NoRemoteFileWithThatIdError as e: log.error(f'{self} {e}') raise exc.CacheNotFoundError( f'{self}' ) from e # have to raise so that we don't overwrite the file log.debug(self.data_headers) if self.local_object_cache_path.exists(): yield from gen else: # FIXME we MUST write the metadata first so that we know the expected size # so that in the event that the generator is only partially run out we know # that we can pick up where we left off with the fetch, this also explains # why all the cases where the cached data size did not match were missing # xattrs entirely self.local_object_cache_path.touch() self.local_object_cache_path.cache_init(meta) yield from self.local_object_cache_path._data_setter(gen) ls = self.local_object_cache_path.size if ls != meta.size: self.local_object_cache_path.unlink() msg = f'{ls} != {meta.size} for {self}' raise ValueError(msg) # FIXME TODO
def subpipeline(cls, data, runtime_context, subpipelines, update=True,
                source_key_optional=True, lifters=None):
    """ [[[[get-path, add-path], ...], pipeline-class, target-path], ...]

        NOTE: this function is a generator, you have to express it! """

    class DataWrapper:
        def __init__(self, data):
            self.data = data

    prepared = []
    for get_adds, pipeline_class, target_path in subpipelines:
        selected_data = {}
        ok = True
        for get_path, add_path in get_adds:
            try:
                value = adops.get(data, get_path)
                if add_path is not None:
                    adops.add(selected_data, add_path, value)
                else:
                    selected_data = value
            except exc.NoSourcePathError as e:
                if source_key_optional:
                    yield get_path, e, pipeline_class
                    ok = False
                    break  # breaks the inner loop
                else:
                    raise e

        if not ok:
            continue

        log.debug(lj(selected_data))
        prepared.append((target_path, pipeline_class, DataWrapper(selected_data),
                         lifters, runtime_context))

    function = adops.update if update else adops.add
    for target_path, pc, *args in prepared:
        p = pc(*args)
        if target_path is not None:
            try:
                function(data, target_path, p.data)
            except BaseException as e:
                import inspect
                if isinstance(pc, object):
                    pi, pc = pc, pc.__class__

                try:
                    __file = inspect.getsourcefile(pc)
                    # getsourcelines returns (lines, lineno); stringify the line number
                    __line = ' line ' + str(inspect.getsourcelines(pc)[-1])
                except TypeError as e2:
                    __file = f'<Thing that is not defined in a file: {pc}>'
                    __line = ''

                if hasattr(p, 'path'):
                    __path = f'"{p.path}"'
                else:
                    __path = 'unknown input'

                raise exc.SubPipelineError(
                    f'Error while processing {p}.data for\n{__path}\n'
                    f'{__file}{__line}') from e

        else:
            p.data  # trigger the pipeline since it is stateful

        yield p