Example #1
    def _get_protocol_json(self, uri):
        #juri = uri + '.json'
        logd.info(uri)
        pi = get_right_id(uri)
        if 'protocols.io' in pi:
            pioid = pi.slug  # FIXME normalize before we ever get here ...
            log.info(pioid)
        else:
            msg = f'protocol uri is not from protocols.io {pi} {self.id}'
            logd.error(msg)
            self.addError(msg)
            return

        #uri_path = uri.rsplit('/', 1)[-1]
        apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
        #'https://www.protocols.io/api/v3/groups/sparc/protocols'
        #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
        #print(apiuri, header)
        log.debug('going to network for protocols')
        resp = requests.get(apiuri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                j = resp.json()  # the api is reasonably consistent
            except BaseException as e:
                log.exception(e)
                raise e
            return j
        else:
            try:
                j = resp.json()
                sc = j['status_code']
                em = j['error_message']
                msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
                logd.error(msg)
                self.addError(msg)
                # can't return here because of the cache
            except BaseException as e:
                log.exception(e)

            logd.error(f'protocol no access {uri} {self.id!r}')
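The method above resolves a protocols.io slug and calls the v3 API with a stored header. A minimal standalone sketch of the same request pattern, assuming a plain `Authorization: Bearer <token>` header (an assumption; the actual contents of self._pio_header are not shown in this example):

import requests

def fetch_protocol_json(pioid, token):
    """Minimal sketch: fetch a protocols.io protocol record by slug.
    The Bearer header layout is an assumption, not necessarily what
    self._pio_header contains."""
    apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
    resp = requests.get(apiuri, headers={'Authorization': f'Bearer {token}'})
    if resp.ok:
        return resp.json()
    # surface the API's own error fields when the request fails
    j = resp.json()
    raise RuntimeError(
        f'{resp.status_code} {j.get("status_code")} {j.get("error_message")}')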
Example #2
    def triples_gen(self):
        rm = self._source

        # FIXME there doesn't seem to be a section that tells me the name
        # of the top level model so I have to know its name beforehand
        # the id is in the model, having the id in the resource map
        # prevents issues if these things get sent decoupled
        id = rm['id']
        mid = id.replace(' ', '-')

        links = rm[id]['links']
        #linknodes = [n for n in rm[id]['nodes'] if n['class'] == 'Link']  # visible confusion

        st = []
        from_to = []
        ot = None
        yield from self.apinatbase()
        for link in links:
            if 'conveyingType' in link:
                if link['conveyingType'] == 'ADVECTIVE':
                    p_is = TEMP.isAdvectivelyConnectedTo
                    p_from = TEMP.advectivelyConnectsFrom
                    p_to = TEMP.advectivelyConnectsTo
                    p_cmat = TEMP.advectivelyConnectsMaterial
                    diffusive = False
                elif link['conveyingType'] == 'DIFFUSIVE':
                    p_is = TEMP.isDiffusivelyConnectedTo
                    p_from = TEMP.diffusivelyConnectsFrom
                    p_to = TEMP.diffusivelyConnectsTo
                    p_cmat = TEMP.diffusivelyConnectsMaterial
                    diffusive = True
                else:
                    log.critical(f'unhandled conveying type {link}')
                    continue

                source = link['source']
                target = link['target']
                ok = True
                if len(from_to) == 2:  # reset accumulators filled by the previous link
                    st = []
                    from_to = []
                for i, e in enumerate((source, target)):
                    ed = rm[e]
                    if 'external' not in ed:
                        if not i and from_to:
                            # TODO make sure the intermediate ids match
                            pass
                        else:
                            ok = False
                            break
                    else:
                        st.append(e)
                        from_to.append(OntId(ed['external'][0]))

                conveying = link['conveyingLyph']
                cd = rm[conveying]
                if 'external' in cd:
                    old_ot = ot
                    ot = OntTerm(cd['external'][0])
                    yield ot.u, rdf.type, owl.Class
                    yield ot.u, TEMP.internalId, rdflib.Literal(conveying)
                    yield ot.u, rdfs.label, rdflib.Literal(ot.label)

                    yield from self.materialTriples(
                        ot.u, link, p_cmat)  # FIXME locate this correctly

                    if ok:
                        u, d = from_to
                        if st[0] == source:
                            yield u, rdfs.label, rdflib.Literal(
                                OntTerm(u).label)
                            yield u, rdf.type, owl.Class
                            yield from cmb.restriction.serialize(
                                ot.u, p_from, u)

                        if st[1] == target:
                            yield d, rdfs.label, rdflib.Literal(
                                OntTerm(d).label)
                            yield d, rdf.type, owl.Class
                            yield from cmb.restriction.serialize(ot.u, p_to, d)

                    if old_ot is not None and old_ot != ot:
                        yield from cmb.restriction.serialize(
                            ot.u, p_from, old_ot.u)

                if diffusive:
                    # we can try to hack this using named individuals
                    # but it is not going to do exactly what is desired
                    s_link = TEMP[f'ApiNATOMY/{mid}/{link["id"]}']
                    s_cd = TEMP[f'ApiNATOMY/{mid}/{cd["id"]}']
                    yield s_link, rdf.type, owl.NamedIndividual
                    yield s_link, rdf.type, TEMP.diffusiveLink  # FIXME I'm not sure these go in the model ...
                    yield s_cd, rdf.type, owl.NamedIndividual
                    if 'external' in cd and cd['external']:
                        oid = OntId(cd['external'][0])
                        yield s_cd, rdf.type, oid.u
                        ot = oid.asTerm()
                        if ot.label:
                            yield oid.u, rdfs.label, ot.label

                    else:
                        yield s_cd, rdf.type, TEMP.conveyingLyph
                        for icd in cd['inCoalescences']:
                            dcd = rm[icd]
                            log.info(lj(dcd))
                            s_icd = TEMP[f'ApiNATOMY/{mid}/{dcd["id"]}']
                            yield s_cd, TEMP.partOfCoalescence, s_icd
                            yield s_icd, rdf.type, owl.NamedIndividual
                            yield s_icd, rdf.type, TEMP[
                                'ApiNATOMY/Coalescence']
                            if 'external' in dcd and dcd['external']:
                                oid = OntId(dcd['external'][0])
                                yield s_icd, rdf.type, oid.u
                                ot = oid.asTerm()
                                if ot.label:
                                    yield oid.u, rdfs.label, ot.label

                            for lyphid in dcd['lyphs']:
                                ild = rm[lyphid]
                                log.info(lj(ild))
                                if 'external' in ild and ild['external']:
                                    yield s_icd, TEMP.hasLyphWithMaterial, OntId(
                                        ild['external'][0])

                if not ok:
                    logd.info(f'{source} {target} issue')
                    continue

                for inid, e in zip(st, from_to):
                    yield e.u, rdf.type, owl.Class
                    yield e.u, rdfs.label, rdflib.Literal(OntTerm(e).label)
                    yield e.u, TEMP.internalId, rdflib.Literal(inid)

                f, t = from_to
                yield from cmb.restriction.serialize(f.u, p_is, t.u)
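triples_gen yields bare (subject, predicate, object) tuples rather than writing into a graph itself. A minimal consumer sketch, assuming rdflib and a hypothetical `converter` object that exposes the generator, collecting the triples and serializing them as Turtle:

import rdflib

def export_turtle(converter, path='apinatomy.ttl'):
    """Minimal sketch: drain a triples generator into an rdflib Graph.
    `converter` is any object with a triples_gen() method yielding
    (s, p, o) tuples; the name is hypothetical."""
    g = rdflib.Graph()
    for triple in converter.triples_gen():
        g.add(triple)
    g.serialize(destination=path, format='turtle')
    return g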
Example #3
def route_reports_errors_id(id, ext=wrap_tables):
    tables, formatted_title, title = report.errors(id=id)
    log.info(id)
    if tables is None:
        return 'Not found', 404
    return wrap_tables(formatted_title, *tables, title=title)
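The handler returns either the wrapped tables or a ('Not found', 404) tuple, the usual body/status convention in Flask. A sketch of how the handler above might be registered, assuming Flask is the framework (an assumption) and that report and wrap_tables are already importable:

from flask import Flask

app = Flask(__name__)

# hypothetical registration; in the real server the handler is probably
# created inside an app factory that already binds report and wrap_tables
app.add_url_rule('/reports/errors/<id>',
                 'route_reports_errors_id',
                 route_reports_errors_id)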
Example #4
def datame(d,
           ca,
           timestamp,
           helpers=None,
           log_level=logging.INFO,
           dp=_p,
           evil=[False],
           dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth', 'ontquery',
                 'augpathlib', 'pyontutils')
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log = makeSimpleLogger(log_name)
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        else:
            if log.level != log_level:
                log.setLevel(log_level)

    rc = d.path._cache_class._remote_class
    if not hasattr(rc, '_cache_anchor'):
        rc._setup()
        rc.anchorTo(ca)

    if not hasattr(BlackfynnCache, '_anchor'):
        # the fact that we only needed this much later in time
        # tells me that we had actually done an excellent job
        # of firewalling the validation pipeline from anything
        # related to the cache beyond the xattr data

        # can't use ca.__class__ because it is the posix variant of _cache_class
        BlackfynnCache._anchor = ca

    prp = d.path.project_relative_path
    if helpers is not None:
        d.add_helpers(helpers)

    out_path = (dp / d.id).with_suffix('.json')
    if out_path.exists() and dumb:
        if not evil[0]:  # FIXME this is SO DUMB to do in here, but ...
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x
            evil[0] = True

        log.warning(f'loading from path {out_path}')
        # FIXME this is _idiotically_ slow with joblib
        # multiple orders of magnitude faster just using listcomp
        with open(out_path, 'rt') as f:
            return fromJson(json.load(f))

    blob_dataset = d.data_for_export(timestamp)
    with open(out_path.with_suffix('.raw.json'),
              'wt') as f:  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
        json.dump(blob_dataset, f, sort_keys=True, indent=2, cls=JEncode)

    try:
        pipe = pipes.IrToExportJsonPipeline(
            blob_dataset)  # FIXME network sandbox violation
        blob_export = pipe.data
        with open(out_path, 'wt') as f:  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
            json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    except Exception as e:
        log.exception(e)
        log.critical(f'error during fancy json export, see previous log entry')

    return blob_dataset
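The per-process logger setup and the "sigh, pickles" docstring suggest datame is meant to run in worker processes. A usage sketch with joblib, which the FIXME above mentions (and warns can be slow for the cached path); datasets, cache_anchor, and timestamp are placeholder names for values the caller already has:

from joblib import Parallel, delayed

def export_all(datasets, cache_anchor, timestamp, n_jobs=4):
    """Minimal sketch: run datame over many datasets in parallel.
    datasets/cache_anchor/timestamp are placeholders, not names from
    the original module."""
    return Parallel(n_jobs=n_jobs)(
        delayed(datame)(d, cache_anchor, timestamp) for d in datasets)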
Example #5
    def validate_path_json_metadata(cls, path_meta_blob):
        from sparcur.core import HasErrors  # FIXME
        he = HasErrors(pipeline_stage=cls.__name__ +
                       '.validate_path_json_metadata')
        mimetypes, suffixes = cls._file_type_status_lookup()  # SIGH this overhead is 2 function calls and a branch
        for i, path_meta in enumerate(path_meta_blob['data']):
            if path_meta['basename'] in cls._banned_basenames:
                msg = f'illegal file detected {path_meta["basename"]}'
                dsrp = path_meta['dataset_relative_path']
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    logd.error(msg)
                status = 'banned'
                path_meta['status'] = status
                continue

            if 'magic_mimetype' in path_meta and 'mimetype' in path_meta:
                # FIXME NOT clear whether magic_mimetype should be used by itself
                # usually magic and file extension together work, magic by itself
                # can give some completely bonkers results
                source = 'magic_mimetype'
                mimetype = path_meta['magic_mimetype']
                muggle_mimetype = path_meta['mimetype']
                if mimetype != muggle_mimetype:
                    msg = f'mime types do not match {mimetype} != {muggle_mimetype}'
                    dsrp = path_meta['dataset_relative_path']
                    if he.addError(msg, path=dsrp, json_path=('data', i)):
                        log.error(msg)
            elif 'magic_mimetype' in path_meta:
                source = 'magic_mimetype'
                mimetype = path_meta['magic_mimetype']
            elif 'mimetype' in path_meta:
                source = 'mimetype'
                mimetype = path_meta['mimetype']
            else:
                mimetype = None

            if mimetype is not None:
                try:
                    status = mimetypes[mimetype]
                    if status == 'banned':
                        msg = f'banned mimetype detected {mimetype}'
                        dsrp = path_meta['dataset_relative_path']
                        if he.addError(msg,
                                       path=dsrp,
                                       json_path=('data', i, source)):
                            logd.error(msg)
                except KeyError as e:
                    status = 'known'
                    if mimetype not in cls._unclassified_mimes:
                        cls._unclassified_mimes.add(mimetype)
                        log.info(f'unclassified mimetype {mimetype}')
            else:
                status = 'unknown'
                dsrp = path_meta['dataset_relative_path']
                if isinstance(dsrp, str):
                    if not dsrp:
                        msg = f'FIXME top level folder needs a mimetype!'
                    else:
                        msg = f'unknown mimetype {path_meta["basename"]}'
                else:
                    msg = f'unknown mimetype {"".join(dsrp.suffixes)}'
                    cls._unknown_suffixes.add(tuple(dsrp.suffixes))
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    logd.warning(msg)

            path_meta['status'] = status

        if he._errors_set:
            he.embedErrors(path_meta_blob)
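The status a path ends up with depends on a banned-basename set plus a mimetype-to-status table returned by _file_type_status_lookup. A toy sketch of that classification logic, with made-up table contents (the real values are not shown in this example):

# hypothetical lookup data; the real values come from _file_type_status_lookup
BANNED_BASENAMES = {'Thumbs.db', '.DS_Store'}
MIMETYPE_STATUS = {
    'application/x-executable': 'banned',
    'text/csv': 'known',
}

def classify(basename, mimetype=None):
    """Return the status string the validator would attach to path_meta."""
    if basename in BANNED_BASENAMES:
        return 'banned'
    if mimetype is None:
        return 'unknown'
    # unrecognized mimetypes are still treated as 'known', just logged once
    return MIMETYPE_STATUS.get(mimetype, 'known')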
Example #6
    def data(self):
        """ get the 'cached' data which isn't really cached at the moment
            once we implement an index for local files then we can hit that
            first from here """
        # we don't keep two copies of the local data
        # unless we are doing a git-like thing
        if self.is_dir():
            raise TypeError('can\'t retrieve data for a directory')

        meta = self.meta
        if meta.file_id is None:
            raise NotImplementedError('can\'t fetch data without a file id')

        #cands = list(self.local_object_cache_dir.glob(self.cache_key))
        # FIXME this does not play well with old_id ...
        # can probably get away with just globing for the old_id in
        # most cases
        # TODO where to store the chain of prior versions? i.e. do
        # we just keep the xattrs in the object cache? how about file moves?
        # sigh git ...
        rgen = None
        if self.local_object_cache_path.exists():
            locsize = self.local_object_cache_path.size
            if locsize != meta.size:
                msg = (f'Partial download detected {locsize} != {meta.size} at'
                       f'\n{self.local_object_cache_path}')
                log.info(msg)
                size = self.local_object_cache_path.size
                kwargs = {}
                if size > 0:
                    if (self.local == self.local_object_cache_path
                            and size > 4096):  # FIXME hardcoded chunksize
                        # XXX there is a fantastic edge case where if
                        # you try to read and write from the same file
                        # only the first chunk will be written and if
                        # you are retrieving from remote then the offset
                        # would be greater than the chunksize so there
                        # will be a gap, so we set chunksize here and
                        # issue a critical log
                        msg = ('You probably did not mean to do this. '
                               f'Refetching {size - 4096} bytes.')
                        log.critical(msg)
                        kwargs['ranges'] = ((4096, ), )
                    else:
                        kwargs['ranges'] = ((size, ), )

                if not hasattr(self._remote_class, '_api'):
                    # see note below
                    self._remote_class.anchorToCache(self.anchor)

                rgen = self._remote_class.get_file_by_id(
                    meta.id, meta.file_id, **kwargs)
                gen = chain((next(rgen), ), self.local_object_cache_path.data)
            else:
                gen = chain(
                    (f'from local cache {self.local_object_cache_path}', ),
                    self.local_object_cache_path.data)
        else:
            if not hasattr(self._remote_class, '_api'):
                # NOTE we do not want to dereference self.remote
                # in this situation because we just want the file
                # not the FS metadata, so we have to ensure that _api
                # is bound
                self._remote_class.anchorToCache(self.anchor)

            gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

        try:
            self.data_headers = next(gen)
        except exc.NoRemoteFileWithThatIdError as e:
            log.error(f'{self} {e}')
            raise exc.CacheNotFoundError(
                f'{self}'
            ) from e  # have to raise so that we don't overwrite the file

        log.log(9, self.data_headers)
        if self.local_object_cache_path.exists():
            yield from gen
            if rgen is None:
                return

            yield from self.local_object_cache_path._data_setter(rgen,
                                                                 append=True)

        else:
            # FIXME we MUST write the metadata first so that we know the expected size
            # so that in the event that the generator is only partially run out we know
            # that we can pick up where we left off with the fetch, this also explains
            # why all the cases where the cached data size did not match were missing
            # xattrs entirely
            if not self.local_object_cache_path.parent.exists():
                # FIXME sigh, no obvious way around having to check
                # every time other than creating all the cache
                # subfolders in advance
                self.local_object_cache_path.parent.mkdir()

            self.local_object_cache_path.touch()
            self.local_object_cache_path.cache_init(meta)

            yield from self.local_object_cache_path._data_setter(gen)

        ls = self.local_object_cache_path.size
        if ls != meta.size:
            self.local_object_cache_path.unlink()
            msg = f'{ls} != {meta.size} for {self}'
            raise ValueError(msg)  # FIXME TODO
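The partial-download branch passes a byte offset through kwargs['ranges'] so the remote fetch resumes where the cached copy stops. A standalone sketch of the same resume idea using a plain HTTP Range header with requests; this is an assumption about the mechanism, since the transport inside get_file_by_id is not shown here:

import os
import requests

def resume_download(url, local_path, chunk_size=4096):
    """Minimal sketch: resume a partial download with an HTTP Range request."""
    offset = os.path.getsize(local_path) if os.path.exists(local_path) else 0
    headers = {'Range': f'bytes={offset}-'} if offset else {}
    with requests.get(url, headers=headers, stream=True) as resp:
        resp.raise_for_status()
        # only append if the server actually honored the range (206 Partial Content)
        mode = 'ab' if offset and resp.status_code == 206 else 'wb'
        with open(local_path, mode) as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)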
Example #7
def cleanup_redis(sender, **kwargs):
    """ For the time being ensure that any old data about process
        state is wiped when we restart. """
    log.info('cleaning up old redis connection ...')
    reset_redis_keys(conn)
    populate_existing_redis(conn)
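The (sender, **kwargs) signature matches a Celery signal handler. A sketch of wiring the function above to a worker signal, assuming worker_ready is the intended hook (an assumption) and that conn is a redis client; reset_redis_keys and populate_existing_redis are the helpers used in the snippet above:

import redis
from celery.signals import worker_ready

conn = redis.Redis()  # assumption: a default local redis instance

# run the cleanup whenever a worker finishes booting; worker_ready is an
# assumption about which signal the project actually uses
worker_ready.connect(cleanup_redis)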
Example #8
        '--no-network': True,  # XXX FIXME we need a way to fetch the data once and then reuse
        '--i-know-what-i-am-doing': True,
        'report': False, 'protocols': False,}  # FIXME separate args for protcur export
options = Options(args, defaults)

project_id = auth.get('remote-organization')
path_source_dir = Path('~/files/sparc-datasets-test').expanduser().resolve()  # FIXME hardcoded  XXX resolve required to avoid mismatches
if not path_source_dir.exists():
    path_source_dir.mkdir(parents=True)

cel = Celery('sparcur-cron',)

cel.conf.worker_hijack_root_logger = False
cel.conf.worker_prefetch_multiplier = 1

log.info(f'STATUS sparcur :id {project_id} :path {path_source_dir}')

# FIXME needed a dedicated worker for the cron queue
cel.conf.task_queues = (
    Queue('cron', Exchange('cron'), routing_key='task.cron',
          #max_priority=100,
          queue_arguments={'x-max-priority': 10},
          ),
    Queue('export', Exchange('export'), routing_key='task.export',
          #max_priority=5,
          queue_arguments={'x-max-priority': 1},
          ),
    Queue('default', Exchange('default'), routing_key='task.default',
          #max_priority=1,
          queue_arguments={'x-max-priority': 1},
          ),  # fallthrough
)
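With x-max-priority set on each queue, individual task calls can pick a queue and a priority when they are sent. A usage sketch continuing the configuration above, with a hypothetical task name and a made-up dataset id:

# hypothetical task; shows how a call would target the priority queues above
@cel.task(name='export_single_dataset')
def export_single_dataset(dataset_id):
    ...

# send to the 'export' queue; priority must stay within x-max-priority (1 here)
export_single_dataset.apply_async(args=('N:dataset:example-id',),
                                  queue='export',
                                  priority=1)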