    def latest_ir(self):
        if not self.__class__._pyru_loaded:
            self.__class__._pyru_loaded = True
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x

        return super().latest_ir
        # fact that the remote fileid is not known, but we are inside
        # the code that actually makes that update ...
        # and the absence of a staging area / history makes the cache.meta
        # the only place we can do this besides the object store, especially
        # if we lack the ability to retrieve checksums for single files (sigh)

        # we already have the latest data (ignoring concurrency)
        remote.update_cache(cache=self.cache, fetch=False)  # FIXME fetch=False => different diff rule
        return remote


Path._bind_flavours()
register_type(Path, 'path')


class StashCache(BlackfynnCache):

    def remote(self):
        return None

    def _meta_setter(self, value):
        if not self.meta:
            super()._meta_setter(value)
        else:
            raise TypeError("you don't want to set a stashed cache")


class StashPath(Path):
    _cache_class = StashCache
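# A minimal sketch (not part of the original module; `stash` and `meta` are
# hypothetical, and this assumes the cache.meta property setter dispatches to
# _meta_setter as in augpathlib) of the write-once contract that
# StashCache._meta_setter enforces: the first assignment populates the stashed
# metadata, any later assignment raises.
def _stash_meta_example(stash, meta):
    cache = stash.cache
    cache.meta = meta  # first write goes through super()._meta_setter
    try:
        cache.meta = meta  # second write is rejected
    except TypeError:
        pass  # expected: "you don't want to set a stashed cache"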
                if element in ('oneOf', 'anyOf', 'allOf'):
                    next(gen)
                    continue

                if element == 'properties':
                    yield next(gen)
                elif element == 'items':
                    # NOTE it is ok to use int in cases like this because even
                    # though it is in principle ambiguous with a case where
                    # someone happens to be using the python function int
                    # as a key in a dict they would never be able to serialize
                    # that to json without some major conversions, so I am
                    # ruling that it is safe to lift to type here since this
                    # is JPointer not rando dict pointer
                    yield int
            except StopIteration:
                break

    @classmethod
    def fromList(cls, iterable):
        return cls('#/' + '/'.join(iterable))

    def asList(self):
        return self.split('/')


# register idlib classes for fromJson
[register_type(i, i.__name__)
 for i in (idlib.Ror, idlib.Doi, idlib.Orcid, idlib.Pio, idlib.Rrid, OntTerm)]
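# Illustrative only (not in the original source; assumes JPointer is the
# enclosing class, per the NOTE above): the list round trip as defined by
# fromList/asList.  asList splits on every '/', so the leading '#' fragment
# marker comes back as its own element and the two are not exact inverses.
def _jpointer_example():
    jp = JPointer.fromList(('contributors', '0', 'name'))
    assert jp == '#/contributors/0/name'
    assert jp.asList() == ['#', 'contributors', '0', 'name']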
class ExportXml(ExportBase):
    """ convert and export metadata embedded in xml files """

    export_type = 'filetype'
    filename_json = 'xml-export.json'

    def export(self, dataset_paths=tuple(), **kwargs):
        return super().export(dataset_paths=dataset_paths, **kwargs)

    def export_other_formats(self, *args, **kwargs):
        pass

    register_type(None, 'all-xml-files')  # FIXME VERY BAD TO NEED TO CALL THIS HERE
    def make_ir(self, dataset_paths=tuple(), jobs=None, debug=False):
        from sparcur.extract import xml as exml

        def do_xml_metadata(local, id):  # FIXME HACK needs its own pipeline
            local_xmls = list(local.rglob('*.xml'))
            missing = [p.as_posix() for p in local_xmls if not p.exists()]
            if missing:
                oops = "\n".join(missing)
                raise BaseException(f'unfetched children\n{oops}')

            blob = {'type': 'all-xml-files',  # FIXME not quite correct use of type here
                    'dataset_id': id,
                    'xml': tuple()}

            blob['xml'] = [{'path': x.relative_to(local).as_posix(),
                            'type': 'path',
                            'mimetype': e.mimetype,  # FIXME should this be in the extracted ??
                            'contents': e.asDict() if e.mimetype else None}
                           for x in local_xmls
                           for e in (exml.ExtractXml(x),)]

            return blob

        if jobs == 1 or debug:
            dataset_dict = {}
            for dataset in dataset_paths:
                blob = do_xml_metadata(dataset.local, dataset.id)
                dataset_dict[dataset.id] = blob
        else:
            # 3.7 0m25.395s, pypy3 fails with unpickling error
            from joblib import Parallel, delayed
            from joblib.externals.loky import get_reusable_executor
            hrm = Parallel(n_jobs=9)(
                delayed(do_xml_metadata)(dataset.local, dataset.id)
                for dataset in dataset_paths)
            get_reusable_executor().shutdown()  # close the loky executor to clear memory
            dataset_dict = {d.id: b for d, b in zip(dataset_paths, hrm)}

        blob_ir = dataset_dict
        return blob_ir,
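# A hedged sketch (not in the original module; `exporter` and `paths` are
# hypothetical) of how make_ir is meant to be consumed: the trailing comma in
# ``return blob_ir,`` makes it a 1-tuple wrapping a {dataset_id: blob} dict
# whose blobs follow do_xml_metadata above, so callers unpack with a comma.
def _make_ir_example(exporter, paths):
    blob_ir, = exporter.make_ir(dataset_paths=paths, jobs=1)  # jobs=1 forces the serial path
    for dataset_id, blob in blob_ir.items():
        assert blob['type'] == 'all-xml-files'
        assert blob['dataset_id'] == dataset_id
    return blob_ir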
def datame(d, ca, timestamp, helpers=None,
           log_level=logging.INFO, dp=_p, evil=[False], dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth', 'ontquery',
                 'augpathlib', 'pyontutils')
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log = makeSimpleLogger(log_name)
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        else:
            if log.level != log_level:
                log.setLevel(log_level)

    rc = d.path._cache_class._remote_class
    if not hasattr(rc, '_cache_anchor'):
        rc._setup()
        rc.anchorTo(ca)

    if not hasattr(BlackfynnCache, '_anchor'):
        # the fact that we only needed this much later in time
        # tells me that we had actually done an excellent job
        # of firewalling the validation pipeline from anything
        # related to the cache beyond the xattr data

        # can't use ca.__class__ because it is the posix variant of
        # _cache_class
        BlackfynnCache._anchor = ca

    prp = d.path.project_relative_path
    if helpers is not None:
        d.add_helpers(helpers)

    out_path = (dp / d.id).with_suffix('.json')
    if out_path.exists() and dumb:
        if not evil[0]:  # FIXME this is SO DUMB to do in here, but ...
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x
            evil[0] = True

        # NOTE log here is whichever logger the loop above configured last
        log.warning(f'loading from path {out_path}')
        # FIXME this is _idiotically_ slow with joblib
        # multiple orders of magnitude faster just using listcomp
        with open(out_path, 'rt') as f:
            return fromJson(json.load(f))

    blob_dataset = d.data_for_export(timestamp)
    with open(out_path.with_suffix('.raw.json'), 'wt') as f:
        # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
        json.dump(blob_dataset, f, sort_keys=True, indent=2, cls=JEncode)

    try:
        pipe = pipes.IrToExportJsonPipeline(blob_dataset)  # FIXME network sandbox violation
        blob_export = pipe.data
        with open(out_path, 'wt') as f:
            # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
            json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    except Exception as e:
        log.exception(e)
        log.critical('error during fancy json export, see previous log entry')

    return blob_dataset
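# A minimal sketch (assuming a driver like the joblib pattern used in
# ExportXml.make_ir above; names here are illustrative, not from this module)
# of how datame is meant to be called as a picklable top-level worker.  The
# module-level def, the per-call logger reconfiguration, and the mutable
# ``evil=[False]`` default all exist so each loky worker process carries its
# own one-shot pysercomb registration state across pickling.
def _datame_example(datasets, ca, timestamp, helpers=None, n_jobs=9):
    from joblib import Parallel, delayed
    return Parallel(n_jobs=n_jobs)(
        delayed(datame)(d, ca, timestamp, helpers=helpers)
        for d in datasets)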