def _get_protocol_json(self, uri):
    #juri = uri + '.json'
    logd.info(uri)
    pi = get_right_id(uri)
    if 'protocols.io' in pi:
        pioid = pi.slug  # FIXME normalize before we ever get here ...
        log.info(pioid)
    else:
        msg = f'protocol uri is not from protocols.io {pi} {self.id}'
        logd.error(msg)
        self.addError(msg)
        return

    #uri_path = uri.rsplit('/', 1)[-1]
    apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
    #'https://www.protocols.io/api/v3/groups/sparc/protocols'
    #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
    #print(apiuri, header)
    log.debug('going to network for protocols')
    resp = requests.get(apiuri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            log.exception(e)
            breakpoint()
            raise e

        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            log.exception(e)

        logd.error(f'protocol no access {uri} {self.id!r}')
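# A minimal standalone sketch of the protocols.io v3 call made above, useful
# when debugging outside the class. The header shape and the PIO_TOKEN
# environment variable are assumptions; the real code builds self._pio_header
# elsewhere (protocols.io v3 expects a Bearer token).
import os
import requests

def fetch_protocol_sketch(pioid):
    apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
    headers = {'Authorization': 'Bearer ' + os.environ.get('PIO_TOKEN', '')}  # assumed token source
    resp = requests.get(apiuri, headers=headers, timeout=30)
    if resp.ok:
        return resp.json()

    # surface the same status_code/error_message fields the method above logs
    try:
        j = resp.json()
        return {'status_code': j.get('status_code'), 'error': j.get('error_message')}
    except ValueError:
        return {'status_code': resp.status_code, 'error': resp.reason}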
def triples_gen(self):
    rm = self._source

    # FIXME there doesn't seem to be a section that tells me the name
    # of the top level model so I have to know its name beforehand
    # the id is in the model, having the id in the resource map
    # prevents issues if these things get sent decoupled
    id = rm['id']
    mid = id.replace(' ', '-')

    links = rm[id]['links']
    #linknodes = [n for n in rm[id]['nodes'] if n['class'] == 'Link']  # visible confusion

    st = []
    from_to = []
    ot = None
    yield from self.apinatbase()
    for link in links:
        if 'conveyingType' in link:
            if link['conveyingType'] == 'ADVECTIVE':
                p_is = TEMP.isAdvectivelyConnectedTo
                p_from = TEMP.advectivelyConnectsFrom
                p_to = TEMP.advectivelyConnectsTo
                p_cmat = TEMP.advectivelyConnectsMaterial
                diffusive = False
            elif link['conveyingType'] == 'DIFFUSIVE':
                p_is = TEMP.isDiffusivelyConnectedTo
                p_from = TEMP.diffusivelyConnectsFrom
                p_to = TEMP.diffusivelyConnectsTo
                p_cmat = TEMP.diffusivelyConnectsMaterial
                diffusive = True
            else:
                log.critical(f'unhandled conveying type {link}')
                continue

            source = link['source']
            target = link['target']
            ok = True
            if len(from_to) == 2:  # otherwise
                st = []
                from_to = []

            for i, e in enumerate((source, target)):
                ed = rm[e]
                if 'external' not in ed:
                    if not i and from_to:
                        # TODO make sure the intermediate ids match
                        pass
                    else:
                        ok = False
                        break
                else:
                    st.append(e)
                    from_to.append(OntId(ed['external'][0]))

            conveying = link['conveyingLyph']
            cd = rm[conveying]
            if 'external' in cd:
                old_ot = ot
                ot = OntTerm(cd['external'][0])
                yield ot.u, rdf.type, owl.Class
                yield ot.u, TEMP.internalId, rdflib.Literal(conveying)
                yield ot.u, rdfs.label, rdflib.Literal(ot.label)

                yield from self.materialTriples(
                    ot.u, link, p_cmat)  # FIXME locate this correctly

                if ok:
                    u, d = from_to
                    if st[0] == source:
                        yield u, rdfs.label, rdflib.Literal(OntTerm(u).label)
                        yield u, rdf.type, owl.Class
                        yield from cmb.restriction.serialize(ot.u, p_from, u)

                    if st[1] == target:
                        yield d, rdfs.label, rdflib.Literal(OntTerm(d).label)
                        yield d, rdf.type, owl.Class
                        yield from cmb.restriction.serialize(ot.u, p_to, d)

                if old_ot is not None and old_ot != ot:
                    yield from cmb.restriction.serialize(ot.u, p_from, old_ot.u)

            if diffusive:
                # we can try to hack this using named individuals
                # but it is not going to do exactly what is desired
                s_link = TEMP[f'ApiNATOMY/{mid}/{link["id"]}']
                s_cd = TEMP[f'ApiNATOMY/{mid}/{cd["id"]}']
                yield s_link, rdf.type, owl.NamedIndividual
                yield s_link, rdf.type, TEMP.diffusiveLink  # FIXME I'm not sure these go in the model ...
                yield s_cd, rdf.type, owl.NamedIndividual
                if 'external' in cd and cd['external']:
                    oid = OntId(cd['external'][0])
                    yield s_cd, rdf.type, oid.u
                    ot = oid.asTerm()
                    if ot.label:
                        yield oid.u, rdfs.label, ot.label

                else:
                    yield s_cd, rdf.type, TEMP.conveyingLyph

                for icd in cd['inCoalescences']:
                    dcd = rm[icd]
                    log.info(lj(dcd))
                    s_icd = TEMP[f'ApiNATOMY/{mid}/{dcd["id"]}']
                    yield s_cd, TEMP.partOfCoalescence, s_icd
                    yield s_icd, rdf.type, owl.NamedIndividual
                    yield s_icd, rdf.type, TEMP['ApiNATOMY/Coalescence']
                    if 'external' in dcd and dcd['external']:
                        oid = OntId(dcd['external'][0])
                        yield s_icd, rdf.type, oid.u
                        ot = oid.asTerm()
                        if ot.label:
                            yield oid.u, rdfs.label, ot.label

                    for lyphid in dcd['lyphs']:
                        ild = rm[lyphid]
                        log.info(lj(ild))
                        if 'external' in ild and ild['external']:
                            yield s_icd, TEMP.hasLyphWithMaterial, OntId(ild['external'][0])

            if not ok:
                logd.info(f'{source} {target} issue')
                continue

            for inid, e in zip(st, from_to):
                yield e.u, rdf.type, owl.Class
                yield e.u, rdfs.label, rdflib.Literal(OntTerm(e).label)
                yield e.u, TEMP.internalId, rdflib.Literal(inid)

            f, t = from_to
            yield from cmb.restriction.serialize(f.u, p_is, t.u)
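# The cmb.restriction.serialize calls above come from the pyontutils
# combinators; as a point of reference, the standard OWL existential
# restriction pattern looks like the plain-rdflib sketch below. The exact
# output shape of the combinator is an assumption here; this only
# illustrates the target RDF pattern.
import rdflib
from rdflib.namespace import OWL, RDF, RDFS

def restriction_triples_sketch(subject, predicate, obj):
    """ yield triples asserting subject rdfs:subClassOf
        [ a owl:Restriction ; owl:onProperty predicate ; owl:someValuesFrom obj ] """
    r = rdflib.BNode()
    yield subject, RDFS.subClassOf, r
    yield r, RDF.type, OWL.Restriction
    yield r, OWL.onProperty, predicate
    yield r, OWL.someValuesFrom, obj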
def route_reports_errors_id(id, ext=wrap_tables):
    tables, formatted_title, title = report.errors(id=id)
    log.info(id)
    if tables is None:
        return 'Not found', 404

    return wrap_tables(formatted_title, *tables, title=title)
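# How a handler like the one above might be wired up, assuming the enclosing
# dashboard is a Flask app; the app object, URL rule, and endpoint name below
# are illustrative assumptions, not taken from the project.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/reports/errors/<id>',
                 endpoint='route_reports_errors_id',
                 view_func=route_reports_errors_id)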
def datame(d, ca, timestamp, helpers=None, log_level=logging.INFO,
           dp=_p, evil=[False], dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth', 'ontquery',
                 'augpathlib', 'pyontutils')
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log = makeSimpleLogger(log_name)
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        else:
            if log.level != log_level:
                log.setLevel(log_level)

    rc = d.path._cache_class._remote_class
    if not hasattr(rc, '_cache_anchor'):
        rc._setup()
        rc.anchorTo(ca)

    if not hasattr(BlackfynnCache, '_anchor'):
        # the fact that we only needed this much later in time
        # tells me that we had actually done an excellent job
        # of firewalling the validation pipeline from anything
        # related to the cache beyond the xattr data

        # can't use ca.__class__ because it is the posix variant of
        # _cache_class
        BlackfynnCache._anchor = ca

    prp = d.path.project_relative_path
    if helpers is not None:
        d.add_helpers(helpers)

    out_path = (dp / d.id).with_suffix('.json')
    if out_path.exists() and dumb:
        if not evil[0]:  # FIXME this is SO DUMB to do in here, but ...
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x
            evil[0] = True

        log.warning(f'loading from path {out_path}')
        # FIXME this is _idiotically_ slow with joblib
        # multiple orders of magnitude faster just using listcomp
        with open(out_path, 'rt') as f:
            return fromJson(json.load(f))

    blob_dataset = d.data_for_export(timestamp)
    with open(out_path.with_suffix('.raw.json'), 'wt') as f:
        # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
        json.dump(blob_dataset, f, sort_keys=True, indent=2, cls=JEncode)

    try:
        pipe = pipes.IrToExportJsonPipeline(blob_dataset)  # FIXME network sandbox violation
        blob_export = pipe.data
        with open(out_path, 'wt') as f:
            # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
            json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)

    except Exception as e:
        log.exception(e)
        log.critical('error during fancy json export, see previous log entry')

    return blob_dataset
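# datame is written to be called once per dataset (the pickle-friendly
# signature exists so it can be dispatched to worker processes). A hedged
# sketch of both dispatch styles mentioned in the comments above; `datasets`,
# `ca`, and `timestamp` are assumed to be provided by the caller.
from joblib import Parallel, delayed

def export_all_sketch(datasets, ca, timestamp, parallel=True):
    if parallel:
        return Parallel(n_jobs=4)(
            delayed(datame)(d, ca, timestamp) for d in datasets)
    else:
        # the listcomp path the FIXME above says is much faster when
        # results are already cached on disk
        return [datame(d, ca, timestamp) for d in datasets]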
def validate_path_json_metadata(cls, path_meta_blob):
    from sparcur.core import HasErrors  # FIXME
    he = HasErrors(pipeline_stage=cls.__name__ + '.validate_path_json_metadata')
    mimetypes, suffixes = cls._file_type_status_lookup()  # SIGH this overhead is 2 function calls and a branch
    for i, path_meta in enumerate(path_meta_blob['data']):
        if path_meta['basename'] in cls._banned_basenames:
            msg = f'illegal file detected {path_meta["basename"]}'
            dsrp = path_meta['dataset_relative_path']
            if he.addError(msg, path=dsrp, json_path=('data', i)):
                logd.error(msg)

            status = 'banned'
            path_meta['status'] = status
            continue

        if 'magic_mimetype' in path_meta and 'mimetype' in path_meta:
            # FIXME NOT clear whether magic_mimetype should be used by itself
            # usually magic and file extension together work, magic by itself
            # can give some completely bonkers results
            source = 'magic_mimetype'
            mimetype = path_meta['magic_mimetype']
            muggle_mimetype = path_meta['mimetype']
            if mimetype != muggle_mimetype:
                msg = f'mime types do not match {mimetype} != {muggle_mimetype}'
                dsrp = path_meta['dataset_relative_path']
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    log.error(msg)

        elif 'magic_mimetype' in path_meta:
            source = 'magic_mimetype'
            mimetype = path_meta['magic_mimetype']
        elif 'mimetype' in path_meta:
            source = 'mimetype'
            mimetype = path_meta['mimetype']
        else:
            mimetype = None

        if mimetype is not None:
            try:
                status = mimetypes[mimetype]
                if status == 'banned':
                    msg = f'banned mimetype detected {mimetype}'
                    dsrp = path_meta['dataset_relative_path']
                    if he.addError(msg, path=dsrp, json_path=('data', i, source)):
                        logd.error(msg)

            except KeyError as e:
                status = 'known'
                if mimetype not in cls._unclassified_mimes:
                    cls._unclassified_mimes.add(mimetype)
                    log.info(f'unclassified mimetype {mimetype}')

        else:
            status = 'unknown'
            dsrp = path_meta['dataset_relative_path']
            if isinstance(dsrp, str):
                if not dsrp:
                    msg = 'FIXME top level folder needs a mimetype!'
                else:
                    msg = f'unknown mimetype {path_meta["basename"]}'

            else:
                msg = f'unknown mimetype {"".join(dsrp.suffixes)}'
                cls._unknown_suffixes.add(tuple(dsrp.suffixes))

            if he.addError(msg, path=dsrp, json_path=('data', i)):
                logd.warning(msg)

        path_meta['status'] = status

    if he._errors_set:
        he.embedErrors(path_meta_blob)
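# The mimetypes/suffixes tables consumed above are produced by
# cls._file_type_status_lookup(); their real contents live elsewhere in the
# project, so the values below are only an assumed illustration of the shape
# the validation loop relies on: a mapping from mimetype to a status string,
# plus a parallel mapping keyed by suffix.
def _file_type_status_lookup_sketch():
    mimetypes = {
        'application/json': 'known',
        'text/csv': 'known',
        'application/x-executable': 'banned',  # assumed example of a banned type
    }
    suffixes = {
        '.json': 'known',
        '.csv': 'known',
    }
    return mimetypes, suffixes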
def data(self):
    """ get the 'cached' data which isn't really cached at the moment
        once we implement an index for local files then we can hit that
        first from here """
    # we don't keep two copies of the local data
    # unless we are doing a git-like thing
    if self.is_dir():
        raise TypeError('can\'t retrieve data for a directory')

    meta = self.meta
    if meta.file_id is None:
        raise NotImplementedError('can\'t fetch data without a file id')

    #cands = list(self.local_object_cache_dir.glob(self.cache_key))
    # FIXME this does not play well with old_id ...
    # can probably get away with just globing for the old_id in
    # most cases
    # TODO where to store the chain of prior versions? i.e. do
    # we just keep the xattrs in the object cache? how about file moves?
    # sigh git ...

    rgen = None
    if self.local_object_cache_path.exists():
        locsize = self.local_object_cache_path.size
        if locsize != meta.size:
            msg = (f'Partial download detected {locsize} != {meta.size} at'
                   f'\n{self.local_object_cache_path}')
            log.info(msg)
            size = self.local_object_cache_path.size
            kwargs = {}
            if size > 0:
                if (self.local == self.local_object_cache_path
                        and size > 4096):  # FIXME hardcoded chunksize
                    # XXX there is a fantastic edge case where if
                    # you try to read and write from the same file
                    # only the first chunk will be written and if
                    # you are retrieving from remote then the offset
                    # would be greater than the chunksize so there
                    # will be a gap, so we set chunksize here and
                    # issue a critical log
                    msg = ('You probably did not mean to do this. '
                           f'Refetching {size - 4096} bytes.')
                    log.critical(msg)
                    kwargs['ranges'] = ((4096,),)
                else:
                    kwargs['ranges'] = ((size,),)

            if not hasattr(self._remote_class, '_api'):
                # see note below
                self._remote_class.anchorToCache(self.anchor)

            rgen = self._remote_class.get_file_by_id(meta.id, meta.file_id, **kwargs)
            gen = chain((next(rgen),), self.local_object_cache_path.data)
        else:
            gen = chain((f'from local cache {self.local_object_cache_path}',),
                        self.local_object_cache_path.data)
    else:
        if not hasattr(self._remote_class, '_api'):
            # NOTE we do not want to dereference self.remote
            # in this situation because we just want the file
            # not the FS metadata, so we have to ensure that _api
            # is bound
            self._remote_class.anchorToCache(self.anchor)

        gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

    try:
        self.data_headers = next(gen)
    except exc.NoRemoteFileWithThatIdError as e:
        log.error(f'{self} {e}')
        # have to raise so that we don't overwrite the file
        raise exc.CacheNotFoundError(f'{self}') from e

    log.log(9, self.data_headers)
    if self.local_object_cache_path.exists():
        yield from gen
        if rgen is None:
            return

        yield from self.local_object_cache_path._data_setter(rgen, append=True)

    else:
        # FIXME we MUST write the metadata first so that we know the expected size
        # so that in the event that the generator is only partially run out we know
        # that we can pick up where we left off with the fetch, this also explains
        # why all the cases where the cached data size did not match were missing
        # xattrs entirely
        if not self.local_object_cache_path.parent.exists():
            # FIXME sigh, no obvious way around having to check
            # every time other than creating all the cache
            # subfolders in advance
            self.local_object_cache_path.parent.mkdir()

        self.local_object_cache_path.touch()
        self.local_object_cache_path.cache_init(meta)

        yield from self.local_object_cache_path._data_setter(gen)

    ls = self.local_object_cache_path.size
    if ls != meta.size:
        self.local_object_cache_path.unlink()
        msg = f'{ls} != {meta.size} for {self}'
        raise ValueError(msg)  # FIXME TODO
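# Consuming the generator above: the chunks it yields can be streamed to a
# local file, and data_headers is populated as a side effect once the first
# item has been pulled from the underlying remote generator. The `cache`
# object and destination path below are placeholders for illustration.
def save_cached_data_sketch(cache, dest):
    with open(dest, 'wb') as f:
        for chunk in cache.data:
            f.write(chunk)

    # headers are only available after the generator has started running
    return cache.data_headers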
def cleanup_redis(sender, **kwargs):
    """ For the time being ensure that any old data about process
        state is wiped when we restart. """
    log.info('cleaning up old redis connection ...')
    reset_redis_keys(conn)
    populate_existing_redis(conn)
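# reset_redis_keys and populate_existing_redis are project helpers defined
# elsewhere; a minimal sketch of what the reset step could look like with
# redis-py, assuming process-state keys share a common prefix (the
# 'sparcur:*' pattern is an assumption, not the project's real key layout).
import redis

def reset_redis_keys_sketch(conn, pattern='sparcur:*'):
    # collect matching keys without blocking the server, then drop them
    keys = list(conn.scan_iter(match=pattern))
    if keys:
        conn.delete(*keys)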
    '--no-network': True,  # XXX FIXME we need a way to fetch the data once and then reuse
    '--i-know-what-i-am-doing': True,
    'report': False,
    'protocols': False,}  # FIXME separate args for protcur export

options = Options(args, defaults)

project_id = auth.get('remote-organization')
path_source_dir = Path('~/files/sparc-datasets-test').expanduser().resolve()  # FIXME hardcoded XXX resolve required to avoid mismatches
if not path_source_dir.exists():
    path_source_dir.mkdir(parents=True)

cel = Celery('sparcur-cron',)

cel.conf.worker_hijack_root_logger = False
cel.conf.worker_prefetch_multiplier = 1

log.info(f'STATUS sparcur :id {project_id} :path {path_source_dir}')

# FIXME needed a dedicated worker for the cron queue
cel.conf.task_queues = (
    Queue('cron', Exchange('cron'), routing_key='task.cron',
          #max_priority=100,
          queue_arguments={'x-max-priority': 10},
          ),
    Queue('export', Exchange('export'), routing_key='task.export',
          #max_priority=5,
          queue_arguments={'x-max-priority': 1},
          ),
    Queue('default', Exchange('default'), routing_key='task.default',
          #max_priority=1,
          queue_arguments={'x-max-priority': 1},
          ),  # fallthrough