Example #1
    def latest_ir(self):
        if not self.__class__._pyru_loaded:
            self.__class__._pyru_loaded = True
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x

        return super().latest_ir
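
The guard above flips a class-level flag so the pysercomb import and register_type calls run exactly once per process. A minimal stand-alone sketch of that lazy one-time registration pattern follows; the registry dict, register_type, and DemoExporter names are hypothetical stand-ins, not the sparcur implementations.

_DEMO_REGISTRY = {}

def register_type(cls, tag):
    # map a serialization tag onto the class that knows how to rebuild it
    _DEMO_REGISTRY[tag] = cls

class DemoExporter:
    _types_loaded = False  # class-level flag: registration happens once per process

    def latest_ir(self):
        if not self.__class__._types_loaded:
            self.__class__._types_loaded = True
            from decimal import Decimal  # heavy import deferred to first use
            register_type(Decimal, 'decimal')

        return sorted(_DEMO_REGISTRY)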
Example #2
            # fact that the remote fileid is not known, but we are inside
            # the code that actually makes that update ...
            # and the absence of a staging area / history makes the cache.meta
            # the only place we can do this besides the object store, especially
            # if we lack the ability to retrieve checksums for single files (sigh)

            # we already have the latest data (ignoring concurrency)
            remote.update_cache(
                cache=self.cache,
                fetch=False)  # FIXME fetch=False => different diff rule

        return remote


Path._bind_flavours()
register_type(Path, 'path')


class StashCache(BlackfynnCache):
    def remote(self):
        return None

    def _meta_setter(self, value):
        if not self.meta:
            super()._meta_setter(value)
        else:
            raise TypeError("you don't want to set a stashed cache")


class StashPath(Path):
    _cache_class = StashCache
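
StashCache rejects any attempt to overwrite metadata that has already been set. Here is a stand-alone sketch of that write-once guard, assuming a plain attribute-backed meta; WriteOnceMeta is a hypothetical name, not part of sparcur.

class WriteOnceMeta:
    """ allow metadata to be set once, then refuse silent overwrites """

    def __init__(self):
        self._meta = None

    @property
    def meta(self):
        return self._meta

    @meta.setter
    def meta(self, value):
        if self._meta is None:
            self._meta = value
        else:
            raise TypeError('metadata already set; refusing to overwrite a stash')

wom = WriteOnceMeta()
wom.meta = {'id': 'dataset-0'}
# wom.meta = {'id': 'dataset-1'}  # would raise TypeError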
Example #3
                if element in ('oneOf', 'anyOf', 'allOf'):
                    next(gen)
                    continue

                if element == 'properties':
                    yield next(gen)
                elif element == 'items':
                    # NOTE it is ok to use the int type itself here: in principle
                    # this is ambiguous with someone using the python builtin int
                    # as a dict key, but such a dict could never be serialized to
                    # json without major conversions, so it is safe to lift to the
                    # type here since this is a JPointer, not an arbitrary dict
                    # pointer
                    yield int

            except StopIteration:
                break

    @classmethod
    def fromList(cls, iterable):
        return cls('#/' + '/'.join(iterable))
        
    def asList(self):
        return self.split('/')


# register idlib classes for fromJson
[register_type(i, i.__name__) for i in
 (idlib.Ror, idlib.Doi, idlib.Orcid, idlib.Pio, idlib.Rrid, OntTerm)]
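
The fromList/asList pair above round-trips pointer strings built on the '#/' prefix convention; note that asList keeps the leading '#' as its own segment. A minimal stand-alone sketch, with JPointerDemo as a hypothetical stand-in for sparcur's JPointer:

class JPointerDemo(str):
    """ '#/'-prefixed pointer built from / split into path segments """

    @classmethod
    def fromList(cls, iterable):
        return cls('#/' + '/'.join(iterable))

    def asList(self):
        return self.split('/')

ptr = JPointerDemo.fromList(['properties', 'contributors', 'items'])
assert str(ptr) == '#/properties/contributors/items'
assert ptr.asList() == ['#', 'properties', 'contributors', 'items']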
Example #4
class ExportXml(ExportBase):
    """ convert and export metadata embedded in xml files """

    export_type = 'filetype'
    filename_json = 'xml-export.json'

    def export(self, dataset_paths=tuple(), **kwargs):
        return super().export(dataset_paths=dataset_paths, **kwargs)

    def export_other_formats(self, *args, **kwargs):
        pass

    register_type(None, 'all-xml-files')  # FIXME VERY BAD TO NEED TO CALL THIS HERE

    def make_ir(self, dataset_paths=tuple(), jobs=None, debug=False):
        from sparcur.extract import xml as exml

        def do_xml_metadata(local, id):  # FIXME HACK needs its own pipeline
            local_xmls = list(local.rglob('*.xml'))
            missing = [p.as_posix() for p in local_xmls if not p.exists()]
            if missing:
                oops = "\n".join(missing)
                raise BaseException(f'unfetched children\n{oops}')

            blob = {
                'type': 'all-xml-files',  # FIXME not quite correct use of type here
                'dataset_id': id,
                'xml': tuple()
            }
            blob['xml'] = [
                {'path': x.relative_to(local).as_posix(),
                 'type': 'path',
                 'mimetype': e.mimetype,  # FIXME should this be in the extracted ??
                 'contents': e.asDict() if e.mimetype else None}
                for x in local_xmls for e in (exml.ExtractXml(x),)]

            return blob

        if jobs == 1 or debug:
            dataset_dict = {}
            for dataset in dataset_paths:
                blob = do_xml_metadata(dataset.local, dataset.id)
                dataset_dict[dataset.id] = blob
        else:
            # 3.7 0m25.395s, pypy3 fails with an unpickling error
            from joblib import Parallel, delayed
            from joblib.externals.loky import get_reusable_executor
            hrm = Parallel(n_jobs=9)(
                delayed(do_xml_metadata)(dataset.local, dataset.id)
                for dataset in dataset_paths)
            get_reusable_executor().shutdown()  # close the loky executor to clear memory
            dataset_dict = {d.id: b for d, b in zip(dataset_paths, hrm)}

        blob_ir = dataset_dict
        return blob_ir,
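
make_ir switches between a plain loop and joblib's process pool, shutting down the loky executor afterwards to release worker memory. A self-contained sketch of that pattern, with a trivial placeholder worker standing in for do_xml_metadata:

from joblib import Parallel, delayed
from joblib.externals.loky import get_reusable_executor

def work(item):  # placeholder for do_xml_metadata(dataset.local, dataset.id)
    return {'item': item, 'square': item * item}

items = list(range(10))
results = Parallel(n_jobs=4)(delayed(work)(i) for i in items)
get_reusable_executor().shutdown()  # release the loky workers and their memory
by_item = {i: r for i, r in zip(items, results)}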
Example #5
def datame(d,
           ca,
           timestamp,
           helpers=None,
           log_level=logging.INFO,
           dp=_p,
           evil=[False],
           dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth', 'ontquery',
                 'augpathlib', 'pyontutils')
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log = makeSimpleLogger(log_name)
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        else:
            if log.level != log_level:
                log.setLevel(log_level)

    rc = d.path._cache_class._remote_class
    if not hasattr(rc, '_cache_anchor'):
        rc._setup()
        rc.anchorTo(ca)

    if not hasattr(BlackfynnCache, '_anchor'):
        # the fact that we only needed this much later in time
        # tells me that we had actually done an excellent job
        # of firewalling the validation pipeline from anything
        # related to the cache beyond the xattr data

        # can't use ca.__class__ because it is the posix variant of _cache_class
        BlackfynnCache._anchor = ca

    prp = d.path.project_relative_path
    if helpers is not None:
        d.add_helpers(helpers)

    out_path = (dp / d.id).with_suffix('.json')
    if out_path.exists() and dumb:
        if not evil[0]:  # FIXME this is SO DUMB to do in here, but ...
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x
            evil[0] = True

        log.warning(f'loading from path {out_path}')
        # FIXME this is _idiotically_ slow with joblib
        # multiple orders of magnitude faster just using listcomp
        with open(out_path, 'rt') as f:
            return fromJson(json.load(f))

    blob_dataset = d.data_for_export(timestamp)
    with open(out_path.with_suffix('.raw.json'), 'wt') as f:  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
        json.dump(blob_dataset, f, sort_keys=True, indent=2, cls=JEncode)

    try:
        pipe = pipes.IrToExportJsonPipeline(blob_dataset)  # FIXME network sandbox violation
        blob_export = pipe.data
        with open(out_path, 'wt') as f:  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
            json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    except Exception as e:
        log.exception(e)
        log.critical('error during fancy json export, see previous log entry')

    return blob_dataset
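
Because datame runs inside joblib worker processes, it has to rebuild logger handlers that exist only in the parent process. A sketch of that re-initialization using only the standard logging module; makeSimpleLogger is pyontutils-specific, so a plain StreamHandler stands in for it here.

import logging

def ensure_logger(name, level=logging.INFO):
    # recreate handler state in a freshly spawned/forked worker process
    log = logging.getLogger(name)
    log.setLevel(level)
    if not log.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(levelname)s:%(name)s: %(message)s'))
        log.addHandler(handler)
        log.info(f'{name} had no handler')
    return log

for name in ('sparcur', 'idlib', 'protcur'):
    ensure_logger(name)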