Example #1
    def update_cache(self):
        log.debug(f'maybe updating cache for {self.name}')
        file_is_different = self.cache._meta_updater(self.meta)
        # update the cache first
        # then move to the new name if relevant
        # prevents moving partial metadata onto existing files
        parent_changed = (hasattr(self._bfobject, 'parent')
                          and self._bfobject.parent != self.cache.parent.id)
        if self.cache.name != self.name or parent_changed:  # this is locally correct
            # the issue is that move is now smarter
            # and will detect if a parent path has changed
            try:
                self.cache.move(remote=self)
            except exc.WhyDidntThisGetMovedBeforeError as e:
                # AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
                # deal with the sadness that is non-unique filenames
                # I am 99.999999999999999% certain that users do not
                # expect this behavior ...
                log.error(e)
                if self.bfobject.package.name != self.bfobject.name:
                    argh = self.bfobject.name
                    self.bfobject.name = self.bfobject.package.name
                    try:
                        log.critical(
                            f'Non unique filename :( '
                            f'{self.cache.name} -> {argh} -> {self.bfobject.name}'
                        )
                        self.cache.move(remote=self)
                    finally:
                        self.bfobject.name = argh
                else:
                    raise e

        return file_is_different
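
A minimal sketch of the retry pattern above in isolation: on a name-collision error, temporarily swap in the package name, retry the move, and always restore the original name. MoveError and move_with_fallback are hypothetical stand-ins for exc.WhyDidntThisGetMovedBeforeError and the surrounding method, not names from the codebase.

class MoveError(Exception):
    """Stand-in for the name-collision error raised by move."""


def move_with_fallback(remote, do_move):
    try:
        do_move(remote)
    except MoveError:
        if remote.package_name == remote.name:
            raise  # nothing else to try

        original = remote.name
        remote.name = remote.package_name  # retry under the package name
        try:
            do_move(remote)
        finally:
            remote.name = original  # always restore the colliding name
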
Example #2
def json_identifier_expansion(obj, *args, path=None, **kwargs):
    """ expand identifiers to json literal form """
    try:
        return _json_identifier_expansion(obj, *args, **kwargs)
    except idlib.exceptions.RemoteError as e:
        oops = json_export_type_converter(obj)
        msg = f'remote error {e} for {type(obj)}: {oops}'
        out = {'id': obj,
               'type': 'identifier',
               'system': obj.__class__.__name__,
               'errors': [{'message': msg, 'path': path}]}
        return out
    except idlib.exceptions.ResolutionError as e:
        oops = json_export_type_converter(obj)
        msg = f'could not resolve {type(obj)}: {oops}'
        out = {'id': obj,
               'type': 'identifier',
               'system': obj.__class__.__name__,
               'errors': [{'message': msg, 'path': path}]}
        return out
    except Exception as e:
        oops = json_export_type_converter(obj)
        msg = f'Unhandled exception {e} in {path}'
        out = {'id': obj,
               'type': 'identifier',
               'system': obj.__class__.__name__,
               'errors': [{'message': msg, 'path': path}]}
        log.critical(msg)
        return out
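
The three handlers above differ only in how the message is built; a hedged sketch of factoring out the shared blob (the helper name is an assumption, not something that exists in the source):

def _identifier_error_blob(obj, msg, path):
    # shared shape of the error records returned by all three handlers above
    return {'id': obj,
            'type': 'identifier',
            'system': obj.__class__.__name__,
            'errors': [{'message': msg, 'path': path}]}
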
Example #3
    def _protocol_uris_resolved(self):
        # FIXME quite slow ...
        for start_uri in self.protocol_uris:
            log.debug(start_uri)
            try:
                if not hasattr(start_uri, 'dereference'):
                    start_uri = idlib.StreamUri(start_uri)

                end_uri = start_uri.dereference()
                yield end_uri
                sc = end_uri.progenitor.status_code
                if sc > 400:
                    msg = f'error accessing {end_uri} {sc}'
                    if self.addError(msg, blame='submission'):
                        logd.error(msg)

            except idlib.exceptions.ResolutionError as e:
                pass  # FIXME I think we already log this error?
            except self._MissingSchema as e:
                if self.addError(e, blame='submission'):
                    logd.error(e)
            except OntId.BadCurieError as e:
                if self.addError(e, blame='submission'):
                    logd.error(e)
            except BaseException as e:
                #breakpoint()
                log.exception(e)
                log.critical('see exception above')
Example #4
    def _name(self):
        name = self.bfobject.name
        if isinstance(self.bfobject, File) and not self.from_packages:
            realname = os.path.basename(self.bfobject.s3_key)
            if name != realname:  # mega weirdness
                if realname.startswith(name):
                    name = realname

                else:
                    realpath = PurePath(realname)
                    namepath = PurePath(name)
                    if namepath.suffixes:
                        log.critical(f'sigh {namepath!r} -?-> {realpath!r}')

                    else:
                        path = namepath
                        for suffix in realpath.suffixes:
                            path = path.with_suffix(suffix)

                        old_name = name
                        name = path.as_posix()
                        log.info(f'name {old_name} -> {name}')

        if '/' in name:
            bads = ','.join(f'{i}' for i, c in enumerate(name) if c == '/')
            self._errors.append(f'slashes {bads}')
            log.critical(f'GO AWAY {self}')
            name = name.replace('/', '_')
            self.bfobject.name = name  # AND DON'T BOTHER US AGAIN

        return name
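
The suffix-transfer branch above in miniature, using only pathlib; the names are illustrative, not taken from the source.

from pathlib import PurePath

namepath = PurePath('image42')        # display name with no suffix
realpath = PurePath('image42.tiff')   # basename of the s3 key
path = namepath
for suffix in realpath.suffixes:
    path = path.with_suffix(suffix)
assert path.as_posix() == 'image42.tiff'
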
Example #5
 def _lookup(self, index_column, value, fail=False, raw=True):
     try:
         row = self.byCol.searchIndex(index_column, value, raw=raw)
         return row
     except KeyError as e:
         # TODO update the sheet automatically
         log.critical(f'No match on {index_column} for: {value}')
         if fail:
             raise e
Example #6
    def _jm_common(self, do_expensive_operations=False):
        # FIXME WARNING resolution only works if we were relative to
        # the current working directory
        if self.is_broken_symlink():
            self = self.absolute()
        else:
            self = self.resolve()  # safety since we have to go hunting paths

        project_path = self.find_cache_root()

        if project_path is None:
            # FIXME TODO I think we used dataset_description as a hint?
            project_path = self.__class__('/')  # FIXME FIXME
            log.critical(f'No dataset path found for {self}!')
            #raise NotImplementedError('current implementation cant anchor with current info')

        dataset_path = [
            p for p in chain((self, ), self.parents)
            if p.parent == project_path
        ][0]
        drp = self.relative_path_from(dataset_path)  # FIXME ...

        dsid = dataset_path.cache_identifier

        blob = {
            'type': 'path',
            'dataset_id': dsid.curie,
            'dataset_relative_path': drp,
            'basename': self.name,  # for sanity's sake
        }

        mimetype = self.mimetype
        if mimetype:
            blob['mimetype'] = mimetype

        if do_expensive_operations:
            blob['magic_mimetype'] = self._magic_mimetype

        if not (self.is_broken_symlink() or self.exists()):
            # TODO search for closest match
            cands = self._closest_existing_matches()
            msg = f'Path does not exist!\n{self}'
            if cands:
                _fcands = [(r, n) for r, n in cands if r < 10]
                fcands = _fcands if _fcands else cands
                msg += f'\n{0: <4} {self.name}\n'
                msg += '\n'.join([f'{r: <4} {n}' for r, n in fcands])
            # do not log the error here, we won't have
            # enough context to know where we got a bad
            # path, but the caller should, maybe a case for
            # inversion of control here
            blob['errors'] = [{
                'message': msg,
                'candidates': cands,
            }]

        return blob, project_path, dsid
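
The candidate filtering above in isolation: prefer matches ranked below 10 and fall back to the full list only when nothing is close. The (rank, name) pairs here are placeholders.

cands = [(3, 'samples.xlsx'), (27, 'subjects.xlsx')]
_fcands = [(r, n) for r, n in cands if r < 10]
fcands = _fcands if _fcands else cands  # nothing close -> show everything
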
Example #7
 def _lookup(self, dataset_id, fail=False, raw=True):
     try:
         row = self.byCol.searchIndex('id', dataset_id, raw=raw)
         return row
     except KeyError as e:
         # TODO update the sheet automatically
         log.critical(f'New dataset! {dataset_id}')
         if fail:
             raise e
Example #8
def ret_val_exp(dataset_id, updated, time_now):
    log.info(f'START {dataset_id}')
    did = PennsieveId(dataset_id)
    uid = 'updated-' + dataset_id
    fid = 'failed-' + dataset_id

    # FIXME detect cases where we have already pulled the latest and don't pull again
    # FIXME TODO smart retrieve so we don't pull if we failed during
    # export instead of pull, should be able to get it from the
    # cached metadata on the dataset

    # FIXME getting file exists errors for pull in here
    # in upstream.mkdir()

    # FIXME we need to track/check the state here too in the event
    # that retrieve succeeds but validate or export fails
    # FIXME getting no paths to fetch errors

    # FIXME detect cases where it appears that a new dataset is in the process of being
    # uploaded and don't run for a while if it is being continually modified
    try:
        try:
            p1 = subprocess.Popen(argv_simple_retrieve(dataset_id))
            out1 = p1.communicate()
            if p1.returncode != 0:
                raise Exception(f'oops return code was {p1.returncode}')
        except KeyboardInterrupt as e:
            p1.send_signal(signal.SIGINT)
            raise e

        dataset_path = (path_source_dir / did.uuid / 'dataset').resolve()
        try:
            p2 = subprocess.Popen(argv_spc_find_meta, cwd=dataset_path)
            out2 = p2.communicate()
            if p2.returncode != 0:
                raise Exception(f'oops return code was {p2.returncode}')
        except KeyboardInterrupt as e:
            p2.send_signal(signal.SIGINT)
            raise e

        try:
            p3 = subprocess.Popen(argv_spc_export, cwd=dataset_path)
            out3 = p3.communicate()
            if p3.returncode != 0:
                raise Exception(f'oops return code was {p3.returncode}')
        except KeyboardInterrupt as e:
            p3.send_signal(signal.SIGINT)
            raise e

        conn.set(uid, updated)
        conn.delete(fid)
        log.info(f'DONE: u: {uid} {updated}')
    except Exception as e:
        log.critical(f'FAIL: {fid} {updated}')
        conn.set(fid, updated)
        log.exception(e)
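
The three subprocess blocks above are nearly identical; a sketch of a shared helper using only the stdlib (run_step is hypothetical, the error message mirrors the original):

import signal
import subprocess


def run_step(argv, cwd=None):
    p = subprocess.Popen(argv, cwd=cwd)
    try:
        p.communicate()
    except KeyboardInterrupt:
        p.send_signal(signal.SIGINT)  # forward the interrupt to the child
        raise

    if p.returncode != 0:
        raise Exception(f'oops return code was {p.returncode}')
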
Example #9
    def triples(self):
        crossref_doi_pred = rdflib.term.URIRef('http://prismstandard.org/namespaces/basic/2.1/doi')
        for blob in self.data['identifier_metadata']:
            id = blob['id']
            if not isinstance(id, idlib.Stream):
                id = idlib.Auto(id)

            if not hasattr(id, 'asUri'):
                breakpoint()

            s = id.asUri(rdflib.URIRef)
            if 'source' in blob:
                source = blob['source']  # FIXME we need to wrap this in our normalized representation
                if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as an alternate ttl
                    pos = (
                        (rdf.type, owl.NamedIndividual),
                        (rdf.type, TEMP[blob['type']]),
                        (dc.publisher, blob['publisher']),
                        #(dc.type, blob['type']),  # FIXME semantify
                        (dc.title, blob['title']),
                        (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                    )
                    g = OntGraph()
                    doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                    data = doi.ttl()
                    if data is None:  # blackfynn has some bad settings on their doi records ...
                        return

                    try:
                        g.parse(data=data, format='ttl')  # FIXME network bad
                    except BaseException as e:
                        loge.exception(e)

                    _tr = [s for s, p, o in g if p == crossref_doi_pred]
                    if _tr:
                        _their_record_s = _tr[0]
                        yield s, owl.sameAs, _their_record_s
                        yield from g
                    else:
                        g.debug()
                        log.critical('No crossref doi section in graph!')
                else:
                    msg = f'dont know what to do with {source}'
                    log.error(msg)
                    #raise NotImplementedError(msg)
                    return
            else:
                msg = f'dont know what to do with {blob} for {id.identifier}'
                log.error(msg)
                #raise NotImplementedError(msg)
                return

            for p, oraw in pos:
                if oraw is not None:
                    o = rdflib.Literal(oraw) if not isinstance(oraw, rdflib.URIRef) else oraw
                    yield s, p, o
Example #10
    def get_file(package, file_id):
        files = package.files
        if len(files) > 1:
            log.critical(f'MORE THAN ONE FILE IN PACKAGE {package.id}')
        for file in files:
            if file.id == file_id:
                return file

        else:
            raise FileNotFoundError(f'{package} has no file with id {file_id} but has:\n{files}')
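
For reference, the else clause above belongs to the for loop, not the if: it only runs when the loop finishes without returning a match, which is what makes raising there safe. A stripped-down version of the same pattern (find_by_id and its arguments are illustrative):

def find_by_id(items, wanted_id):
    for item in items:
        if item.id == wanted_id:
            return item
    else:  # loop exhausted without an early return
        raise FileNotFoundError(f'no item with id {wanted_id!r}')
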
Example #11
    def bfobject(self):
        if hasattr(self, '_bfobject'):
            return self._bfobject

        if isinstance(self._seed, self.__class__):
            bfobject = self._seed.bfobject

        elif isinstance(self._seed, BaseNode):
            bfobject = self._seed

        elif isinstance(self._seed, str):
            bfobject = self._api.get(self._seed)

        elif isinstance(self._seed, PathMeta):
            bfobject = self._api.get(self._seed.id)

        else:
            raise TypeError(self._seed)

        if hasattr(bfobject, '_json'):
            # constructed from a packages query
            # which we need in order for things to be fastish
            self._bfobject = bfobject
            return self._bfobject

        if isinstance(bfobject, DataPackage):

            def transfer(file, bfobject):
                file.parent = bfobject.parent
                file.dataset = bfobject.dataset
                file.state = bfobject.state
                file.package = bfobject
                return file

            files = bfobject.files
            parent = bfobject.parent
            if files:
                if self._file_id is not None:
                    for file in files:
                        if file.id == self._file_id:
                            bfobject = transfer(file, bfobject)

                elif len(files) > 1:
                    log.critical(f'MORE THAN ONE FILE IN PACKAGE {bfobject.id}')
                else:
                    file = files[0]
                    bfobject = transfer(file, bfobject)

                bfobject.parent = parent  # sometimes we will just reset a parent to itself
            else:
                log.warning(f'No files in package {bfobject.id}')

        self._bfobject = bfobject
        return self._bfobject
Example #12
    def institutionTypes(self):
        if 'types' in self.data:
            for t in self.data['types']:
                if t == 'Other':
                    log.info(self.label)

                yield self._type_map[t]

        else:
            log.critical(self.data)
            raise TypeError('wat')
Example #13
    def _lookup(self, dataset_id, fail=False, raw=True):
        try:
            row, iv = self._row_from_index('id', dataset_id)
            return row
        except AttributeError as e:
            # TODO update the sheet automatically
            if dataset_id not in self._news:
                log.critical(f'New dataset! {dataset_id}')
                self._news.append(dataset_id)

            if fail:
                raise e
Example #14
def dereference_all_identifiers(obj, stage, *args, path=None, addError=None, **kwargs):
    try:
        dict_literal = _json_identifier_expansion(obj)
    except idlib.exc.RemoteError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line

        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))

        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}

    except idlib.exc.ResolutionError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line

        oops = json_export_type_converter(obj)
        msg = (f'{stage.lifters.id} could not resolve '  # FIXME lifters sigh
               f'{type(obj)}: {oops} {obj.asUri()}')
        error = dict(error=msg,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))

        if addError:
            if addError(**error):
                logd.error(msg)
        else:
            return {'errors': [error]}
    except Exception as e:
        log.critical(f'Unhandled exception {e} in {path}')
        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='stage',
                     path=tuple(path))

        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}
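
For reference, all three handlers above hand the same record shape to addError (or return it wrapped in 'errors'); the values below are placeholders:

error = dict(error='<exception instance or message string>',
             pipeline_stage='<Stage class name>',
             blame='submission',  # 'stage' for the unhandled-exception case
             path=('contributors', 0, 'affiliation'))  # illustrative path
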
Example #15
    def encode(self, field, value):
        if field == 'file_id':
            if not value:
                if value is not None:
                    log.critical(f'{value!r} for file_id empty but not None!')
                value = None

        if value is None:
            return self.empty

        if field in ('errors', ):
            return self.subfieldsep.join(value)

        if field == 'checksum':
            return value.hex()  # raw hex may contain field separators :/

        return _str_encode(field, value)
Example #16
    def __new__(cls, cache_anchor, local_class):
        if isinstance(cache_anchor, BlackfynnCache):
            try:
                blackfynn_local_instance = BFLocal(cache_anchor.id)
            except (requests.exceptions.ConnectionError, exc.MissingSecretError) as e:
                log.critical(f'Could not connect to blackfynn {e!r}')
                #blackfynn_local_instance = FakeBFLocal(anchor.id, anchor)  # WARNING pollutes things!
                blackfynn_local_instance = 'CONNECTION ERROR'

        else:
            raise TypeError(f'{type(cache_anchor)} is not BFLocal or BlackfynnCache!')

        cache_class = cache_anchor.__class__
        self = super().__new__(cls, local_class, cache_class, _api=blackfynn_local_instance)
        cls._cache_anchor = cache_anchor
        self._errors = []
        self.root = self._api.root
        return self
Example #17
    def _jsonMetadata(self, do_expensive_operations=False):
        # FIXME WARNING resolution only works if we were relative to
        # the current working directory
        if self.is_broken_symlink():
            self = self.absolute()
        else:
            self = self.resolve()  # safety since we have to go hunting paths

        project_path = self.find_cache_root()

        if project_path is None:
            # FIXME TODO I think we used dataset_description as a hint?
            project_path = Path('/')  # FIXME FIXME
            log.critical(f'No dataset path found for {self}!')
            #raise NotImplementedError('current implementation cant anchor with current info')

        dataset_path = [
            p for p in chain((self, ), self.parents)
            if p.parent == project_path
        ][0]
        drp = self.relative_path_from(dataset_path)  # FIXME ...

        blob = {
            'type': 'path',
            'dataset_id': dataset_path.cache_id,
            'dataset_relative_path': drp,
        }

        mimetype = self.mimetype
        if mimetype:
            blob['mimetype'] = mimetype

        if do_expensive_operations:
            blob['magic_mimetype'] = self._magic_mimetype

        if not (self.is_broken_symlink() or self.exists()):
            msg = f'Path does not exist! {self}'
            # do not log the error here, we won't have
            # enough context to know where we got a bad
            # path, but the caller should, maybe a case for
            # inversion of control here
            blob['errors'] = [{'message': msg}]

        return blob
Example #18
    def __call__(self, affiliation_string):
        if not isinstance(affiliation_string, str):
            logd.critical(str(affiliation_string))
            return self(affiliation_string[0] + 'ERROR ERROR')

        m = self.mapping

        if not isinstance(affiliation_string, str):
            log.critical('sigh')
            return None

        if affiliation_string in m:
            return m[affiliation_string]
        else:
            # FIXME super inefficient
            las = len(affiliation_string)
            for l, s in sorted([(len(k), k) for k in m.keys()], reverse=True):
                if l <= las and s in affiliation_string:
                    return m[s]
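
The fallback branch above does a longest-key substring match; a self-contained sketch with a toy mapping (keys and values are placeholders):

m = {'MIT': 'affil-1', 'MIT Media Lab': 'affil-2'}
affiliation_string = 'MIT Media Lab, Cambridge, MA'
match = None
for l, s in sorted([(len(k), k) for k in m], reverse=True):
    if l <= len(affiliation_string) and s in affiliation_string:
        match = m[s]  # longest key wins: 'MIT Media Lab'
        break
assert match == 'affil-2'
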
Example #19
    def published_online(self, blob):
        try:
            dpl = blob['issued']['date-parts']
        except KeyError as e:
            log.critical(e)
            return None

        dp = dpl[0]
        if len(dp) == 3:
            y, m, d = dp
            return f'{y}-{m:0>2}-{d:0>2}'
        elif len(dp) == 2:
            y, m = dp
            return f'{y}-{m:0>2}'
        elif len(dp) == 1:
            y, = dp
            return f'{y}'
        else:
            raise NotImplementedError(f'what the? {dp}')
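
A quick check of the zero-padded formatting used above (values chosen for illustration):

y, m, d = 2021, 3, 9
assert f'{y}-{m:0>2}-{d:0>2}' == '2021-03-09'
assert f'{y}-{m:0>2}' == '2021-03'
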
Example #20
    def _has_remote_files(self):
        """ this will fetch """
        bfobject = self.bfobject
        if not isinstance(bfobject, DataPackage):
            return False

        files = bfobject.files
        if not files:
            return False

        if len(files) > 1:
            log.critical(f'{self} has more than one file! Not switching bfobject!')
            return True

        file, = files
        file.parent = bfobject.parent
        file.dataset = bfobject.dataset
        file.package = bfobject
        self._bfobject = file
        return True
Example #21
 def _lookup(self, dataset_id):
     try:
         return self.byCol.searchIndex('id', dataset_id)
     except KeyError as e:
         # TODO update the sheet automatically
         log.critical(f'New dataset! {dataset_id}')
Example #22
    def data(self):
        """ get the 'cached' data which isn't really cached at the moment
            once we implement an index for local files then we can hit that
            first from here """
        # we don't keep two copies of the local data
        # unless we are doing a git-like thing
        if self.is_dir():
            raise TypeError('can\'t retrieve data for a directory')

        meta = self.meta
        if meta.file_id is None:
            raise NotImplementedError('can\'t fetch data without a file id')

        #cands = list(self.local_object_cache_dir.glob(self.cache_key))
        # FIXME this does not play well with old_id ...
        # can probably get away with just globing for the old_id in
        # most cases
        # TODO where to store the chain of prior versions? i.e. do
        # we just keep the xattrs in the object cache? how about file moves?
        # sigh git ...
        rgen = None
        if self.local_object_cache_path.exists():
            locsize = self.local_object_cache_path.size
            if locsize != meta.size:
                msg = (f'Partial download detected {locsize} != {meta.size} at'
                       f'\n{self.local_object_cache_path}')
                log.info(msg)
                size = self.local_object_cache_path.size
                kwargs = {}
                if size > 0:
                    if (self.local == self.local_object_cache_path
                            and size > 4096):  # FIXME hardcoded chunksize
                        # XXX there is a fantastic edge case where if
                        # you try to read and write from the same file
                        # only the first chunk will be written and if
                        # you are retrieving from remote then the offset
                        # would be greater than the chunksize so there
                        # will be a gap, so we set chunksize here and
                        # issue a critical log
                        msg = ('You probably did not mean to do this. '
                               f'Refetching {size - 4096} bytes.')
                        log.critical(msg)
                        kwargs['ranges'] = ((4096, ), )
                    else:
                        kwargs['ranges'] = ((size, ), )

                if not hasattr(self._remote_class, '_api'):
                    # see note below
                    self._remote_class.anchorToCache(self.anchor)

                rgen = self._remote_class.get_file_by_id(
                    meta.id, meta.file_id, **kwargs)
                gen = chain((next(rgen), ), self.local_object_cache_path.data)
            else:
                gen = chain(
                    (f'from local cache {self.local_object_cache_path}', ),
                    self.local_object_cache_path.data)
        else:
            if not hasattr(self._remote_class, '_api'):
                # NOTE we do not want to dereference self.remote
                # in this situation because we just want the file
                # not the FS metadata, so we have to ensure that _api
                # is bound
                self._remote_class.anchorToCache(self.anchor)

            gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

        try:
            self.data_headers = next(gen)
        except exc.NoRemoteFileWithThatIdError as e:
            log.error(f'{self} {e}')
            raise exc.CacheNotFoundError(
                f'{self}'
            ) from e  # have to raise so that we don't overwrite the file

        log.log(9, self.data_headers)
        if self.local_object_cache_path.exists():
            yield from gen
            if rgen is None:
                return

            yield from self.local_object_cache_path._data_setter(rgen,
                                                                 append=True)

        else:
            # FIXME we MUST write the metadata first so that we know the expected size
            # so that in the event that the generator is only partially run out we know
            # that we can pick up where we left off with the fetch, this also explains
            # why all the cases where the cached data size did not match were missing
            # xattrs entirely
            if not self.local_object_cache_path.parent.exists():
                # FIXME sigh, no obvious way around having to check
                # every time other than creating all the cache
                # subfolders in advance
                self.local_object_cache_path.parent.mkdir()

            self.local_object_cache_path.touch()
            self.local_object_cache_path.cache_init(meta)

            yield from self.local_object_cache_path._data_setter(gen)

        ls = self.local_object_cache_path.size
        if ls != meta.size:
            self.local_object_cache_path.unlink()
            msg = f'{ls} != {meta.size} for {self}'
            raise ValueError(msg)  # FIXME TODO
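
The final size check above in isolation: after writing, compare the on-disk size against the expected size from the metadata and discard the cached copy on mismatch rather than keeping a truncated file. This sketch uses stdlib pathlib instead of the augmented path class above; the helper name is an assumption.

import pathlib


def verify_cached_size(cache_path: pathlib.Path, expected_size: int):
    actual = cache_path.stat().st_size
    if actual != expected_size:
        cache_path.unlink()  # do not keep a partial or corrupt copy
        raise ValueError(f'{actual} != {expected_size} for {cache_path}')
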
Example #23
    def triples_gen(self):
        rm = self._source

        # FIXME there doesn't seem to be a section that tells me the name
        # of top level model so I have to know its name beforehand
        # the id is in the model, having the id in the resource map
        # prevents issues if these things get sent decoupled
        id = rm['id']
        mid = id.replace(' ', '-')

        links = rm[id]['links']
        #linknodes = [n for n in rm[id]['nodes'] if n['class'] == 'Link']  # visible confusion

        st = []
        from_to = []
        ot = None
        yield from self.apinatbase()
        for link in links:
            if 'conveyingType' in link:
                if link['conveyingType'] == 'ADVECTIVE':
                    p_is = TEMP.isAdvectivelyConnectedTo
                    p_from = TEMP.advectivelyConnectsFrom
                    p_to = TEMP.advectivelyConnectsTo
                    p_cmat = TEMP.advectivelyConnectsMaterial
                    diffusive = False
                elif link['conveyingType'] == 'DIFFUSIVE':
                    p_is = TEMP.isDiffusivelyConnectedTo
                    p_from = TEMP.diffusivelyConnectsFrom
                    p_to = TEMP.diffusivelyConnectsTo
                    p_cmat = TEMP.diffusivelyConnectsMaterial
                    diffusive = True
                else:
                    log.critical(f'unhandled conveying type {link}')
                    continue

                source = link['source']
                target = link['target']
                ok = True
                if len(from_to) == 2:  # otherwise
                    st = []
                    from_to = []
                for i, e in enumerate((source, target)):
                    ed = rm[e]
                    if 'external' not in ed:
                        if not i and from_to:
                            # TODO make sure the intermediate ids match
                            pass
                        else:
                            ok = False
                            break
                    else:
                        st.append(e)
                        from_to.append(OntId(ed['external'][0]))

                conveying = link['conveyingLyph']
                cd = rm[conveying]
                if 'external' in cd:
                    old_ot = ot
                    ot = OntTerm(cd['external'][0])
                    yield ot.u, rdf.type, owl.Class
                    yield ot.u, TEMP.internalId, rdflib.Literal(conveying)
                    yield ot.u, rdfs.label, rdflib.Literal(ot.label)

                    yield from self.materialTriples(
                        ot.u, link, p_cmat)  # FIXME locate this correctly

                    if ok:
                        u, d = from_to
                        if st[0] == source:
                            yield u, rdfs.label, rdflib.Literal(
                                OntTerm(u).label)
                            yield u, rdf.type, owl.Class
                            yield from cmb.restriction.serialize(
                                ot.u, p_from, u)

                        if st[1] == target:
                            yield d, rdfs.label, rdflib.Literal(
                                OntTerm(d).label)
                            yield d, rdf.type, owl.Class
                            yield from cmb.restriction.serialize(ot.u, p_to, d)

                    if old_ot is not None and old_ot != ot:
                        yield from cmb.restriction.serialize(
                            ot.u, p_from, old_ot.u)

                if diffusive:
                    # we can try to hack this using named individuals
                    # but it is not going to do exactly what is desired
                    s_link = TEMP[f'ApiNATOMY/{mid}/{link["id"]}']
                    s_cd = TEMP[f'ApiNATOMY/{mid}/{cd["id"]}']
                    yield s_link, rdf.type, owl.NamedIndividual
                    yield s_link, rdf.type, TEMP.diffusiveLink  # FIXME I'm not sure these go in the model ...
                    yield s_cd, rdf.type, owl.NamedIndividual
                    if 'external' in cd and cd['external']:
                        oid = OntId(cd['external'][0])
                        yield s_cd, rdf.type, oid.u
                        ot = oid.asTerm()
                        if ot.label:
                            yield oid.u, rdfs.label, ot.label

                    else:
                        yield s_cd, rdf.type, TEMP.conveyingLyph
                        for icd in cd['inCoalescences']:
                            dcd = rm[icd]
                            log.info(lj(dcd))
                            s_icd = TEMP[f'ApiNATOMY/{mid}/{dcd["id"]}']
                            yield s_cd, TEMP.partOfCoalescence, s_icd
                            yield s_icd, rdf.type, owl.NamedIndividual
                            yield s_icd, rdf.type, TEMP[
                                'ApiNATOMY/Coalescence']
                            if 'external' in dcd and dcd['external']:
                                oid = OntId(dcd['external'][0])
                                yield s_icd, rdf.type, oid.u
                                ot = oid.asTerm()
                                if ot.label:
                                    yield oid.u, rdfs.label, ot.label

                            for lyphid in dcd['lyphs']:
                                ild = rm[lyphid]
                                log.info(lj(ild))
                                if 'external' in ild and ild['external']:
                                    yield s_icd, TEMP.hasLyphWithMaterial, OntId(
                                        ild['external'][0])

                if not ok:
                    logd.info(f'{source} {target} issue')
                    continue

                for inid, e in zip(st, from_to):
                    yield e.u, rdf.type, owl.Class
                    yield e.u, rdfs.label, rdflib.Literal(OntTerm(e).label)
                    yield e.u, TEMP.internalId, rdflib.Literal(inid)

                f, t = from_to
                yield from cmb.restriction.serialize(f.u, p_is, t.u)
Example #24
    def add(data, target_path, value, fail_on_exists=True, update=False):
        """ Note on semantics when target_path contains the type int.
            Normally when adding a path all the parents are added because
            we are expecting a direct path down. However, if the path
            contains int then it implicitly expects the list to already
            exist. Therefore any failure on the way TO a list will
            immediately abort and not add the keys to the non-existent list.
            This is consistent with the approach where keys are not required
            but if their value is a list it must not be empty. Thus we abort
            so that we don't go around creating a bunch of empty lists that
            will show up later as errors when validating the schema. """
        # type errors can occur here ...
        # e.g. you try to go to a string
        if not [_ for _ in (list, tuple) if isinstance(target_path, _)]:
            msg = f'target_path is not a list or tuple! {type(target_path)}'
            raise TypeError(msg)

        if False and target_path == ['@context', '@base']:
            # use to debug TargetPathExistsError issues
            if '@tracker' not in data:
                data['@tracker'] = []
            try:
                raise BaseException('tracker')
            except BaseException as e:
                data['@tracker'].append(e)

            if '@context' in data and '@base' in data['@context']:
                log.critical(f'target present {data["id"]}')
            else:
                log.critical(f'target not present {data["id"]}')

        target_prefixes = target_path[:-1]
        target_key = target_path[-1]
        target = data
        is_subpath_add = int in target_path
        for i, target_name in enumerate(target_prefixes):
            if target_name is int:  # add same value to all objects in list
                if not is_list_or_tuple(target):
                    msg = (f'attempt to add to all elements of not a list '
                           f'{type(target)} target_path was {target_path} '
                           f'target_name was {target_name}')
                    raise TypeError(msg)
                # LOL PYTHON namespaces
                [AtomicDictOperations.add(subtarget, target_path[i + 1:], value)
                 for subtarget in target]
                return  # int terminates this level of an add

            if target_name not in target:  # TODO list indices XXX that is really append though ...
                if is_subpath_add:
                    # if we are targeting objects in a list for addition
                    # abort the first time we would have to create a key
                    # because we will eventually create an empty list
                    # which we won't be able to add anything to and will
                    # likely cause schema validation errors
                    return

                target[target_name] = {}

            target = target[target_name]

        if update:
            pass
        elif fail_on_exists and target_key in target:
            msg = f'A value already exists at path {target_path} in\n{lj(data)}'
            raise exc.TargetPathExistsError(msg)

        target[target_key] = value
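
Illustrative calls, assuming add is a staticmethod on a class named AtomicDictOperations as the recursive call in the body suggests; the paths and values are made up for the example.

data = {}
AtomicDictOperations.add(data, ['meta', 'award'], 'award-123')
# data == {'meta': {'award': 'award-123'}}

rows = {'contributors': [{'name': 'a'}, {'name': 'b'}]}
AtomicDictOperations.add(rows, ['contributors', int, 'role'], 'author')
# every element of the 'contributors' list receives the same 'role' key
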
Example #25
def datame(d,
           ca,
           timestamp,
           helpers=None,
           log_level=logging.INFO,
           dp=_p,
           evil=[False],
           dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth', 'ontquery',
                 'augpathlib', 'pyontutils')
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log = makeSimpleLogger(log_name)
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        else:
            if log.level != log_level:
                log.setLevel(log_level)

    rc = d.path._cache_class._remote_class
    if not hasattr(rc, '_cache_anchor'):
        rc._setup()
        rc.anchorTo(ca)

    if not hasattr(BlackfynnCache, '_anchor'):
        # the fact that we only needed this much later in time
        # tells me that we had actually done an excellent job
        # of firewalling the validation pipeline from anything
        # related to the cache beyond the xattr data

        # can't use ca.__class__ because it is the posix variant of _cache_class
        BlackfynnCache._anchor = ca

    prp = d.path.project_relative_path
    if helpers is not None:
        d.add_helpers(helpers)

    out_path = (dp / d.id).with_suffix('.json')
    if out_path.exists() and dumb:
        if not evil[0]:  # FIXME this is SO DUMB to do in here, but ...
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x
            evil[0] = True

        log.warning(f'loading from path {out_path}')
        # FIXME this is _idiotically_ slow with joblib
        # multiple orders of magnitude faster just using listcomp
        with open(out_path, 'rt') as f:
            return fromJson(json.load(f))

    blob_dataset = d.data_for_export(timestamp)
    with open(out_path.with_suffix('.raw.json'),
              'wt') as f:  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
        json.dump(blob_dataset, f, sort_keys=True, indent=2, cls=JEncode)

    try:
        pipe = pipes.IrToExportJsonPipeline(
            blob_dataset)  # FIXME network sandbox violation
        blob_export = pipe.data
        with open(out_path, 'wt') as f:  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
            json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    except Exception as e:
        log.exception(e)
        log.critical(f'error during fancy json export, see previous log entry')

    return blob_dataset
Example #26
    def setup(cls, *, local_only=False):
        # FIXME this is a mess
        """ make sure we have all datasources
            calling this again will refresh helpers
        """
        if hasattr(Integrator, '__setup') and Integrator.__setup:
            return  # already setup

        Integrator.__setup = True

        for _cls in cls.mro():
            if _cls != cls:
                if hasattr(_cls, 'setup'):
                    _cls.setup()

        dat.DatasetStructure.rate = cls.rate

        class FakeOrganSheet:
            modality = lambda v: None
            organ_term = lambda v: None
            award_manual = lambda v: None
            byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
            techniques = lambda v: []
            protocol_uris = lambda v: []

        class FakeAffilSheet:
            def __call__(self, *args, **kwargs):
                return

        class FakeOverviewSheet:
            def __call__(self, *args, **kwargs):
                return

        # unanchored helpers
        if cls.no_google or local_only:
            log.critical('no google no organ data')
            cls.organs_sheet = FakeOrganSheet
            cls.affiliations = FakeAffilSheet()
            cls.overview_sheet = FakeOverviewSheet()
        else:
            # ipv6 resolution issues :/ also issues with pickling
            #cls.organs_sheet = sheets.Organs(fetch_grid=True)  # this kills parallelism
            cls.organs_sheet = sheets.Organs(
            )  # if fetch_grid = False @ class level ok
            cls.affiliations = sheets.Affiliations()
            cls.overview_sheet = sheets.Overview()

            # zap all the services (apparently doesn't help)
            # yep, its just the organ sheet, these go in and out just fine
            #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service')
            #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro')

            #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, '_spreadsheet_service'):
            #delattr(s, '_spreadsheet_service')

            # YOU THOUGHT IT WAS GOOGLE IT WAS ME ORGANS ALL ALONG!
            #cls.organs_sheet = FakeOrganSheet  # organs is BAD

            #cls.affiliations = FakeAffilSheet()  # affiliations is OK
            #cls.overview_sheet = FakeOverviewSheet()  # overview is OK

            #breakpoint()
            # remove byCol which is unpickleable (super duper sigh)
            #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, 'byCol'):
            #delattr(s, 'byCol')

        if cls.no_google:
            cls.organ = lambda award: None

        if local_only:
            cls.organ = lambda award: None
            cls.member = lambda first, last: None
        else:
            cls.organ = OrganData()
            if hasattr(State, 'member'):
                cls.member = State.member
            else:
                log.error('State missing member, using State seems '
                          'like a good idea until you go to multiprocessing')
                cls.member = lambda first, last: None
Example #27
    def bfobject(self):
        if hasattr(self, '_bfobject'):
            return self._bfobject

        if isinstance(self._seed, self.__class__):
            bfobject = self._seed.bfobject

        elif isinstance(self._seed, BaseNode):
            bfobject = self._seed

        elif isinstance(self._seed, str):
            try:
                bfobject = self._api.get(self._seed)
            except Exception as e:  # sigh
                if self._local_only:
                    _class = id_to_type(self._seed)
                    if issubclass(_class, Dataset):
                        bfobject = _class(self._local_dataset_name)
                        bfobject.id = self._seed
                    else:
                        raise NotImplementedError(f'{_class}') from e
                else:
                    raise e

        elif isinstance(self._seed, PathMeta):
            bfobject = self._api.get(self._seed.id)

        else:
            raise TypeError(self._seed)

        if hasattr(bfobject, '_json'):
            # constructed from a packages query
            # which we need in order for things to be fastish
            self._bfobject = bfobject
            return self._bfobject

        if isinstance(bfobject, DataPackage):

            def transfer(file, bfobject):
                file.parent = bfobject.parent
                file.dataset = bfobject.dataset
                file.state = bfobject.state
                file.package = bfobject
                return file

            files = bfobject.files
            parent = bfobject.parent
            if files:
                if self._file_id is not None:
                    for file in files:
                        if file.id == self._file_id:
                            bfobject = transfer(file, bfobject)

                elif len(files) > 1:
                    log.critical(
                        f'MORE THAN ONE FILE IN PACKAGE {bfobject.id}')
                    if (len(set(f.size for f in files)) == 1
                            and len(set(f.name for f in files)) == 1):
                        log.critical(
                            'Why are there multiple files with the same name and size here?'
                        )
                        file = files[0]
                        bfobject = transfer(file, bfobject)
                    else:
                        log.critical(
                            f'There are actually multiple files ...\n{files}')

                else:
                    file = files[0]
                    bfobject = transfer(file, bfobject)

                bfobject.parent = parent  # sometimes we will just reset a parent to itself
            else:
                log.warning(f'No files in package {bfobject.id}')

        self._bfobject = bfobject
        return self._bfobject