def curation_export_published(export_path, out_base=None):
    p = Path(export_path).expanduser().resolve()
    ce = OntResPath(p / 'curation-export.ttl')
    orps = [OntResPath(_) for _ in (p / 'datasets').children
            if _.suffix == '.ttl']
    graphs = [o.graph for o in orps]
    merged = _populate_published(ce, graphs)
    op = p if out_base is None else Path(out_base)
    merged.write(op / 'curation-export-published.ttl')
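
# A minimal usage sketch (not part of the original module): it assumes an
# export directory laid out the way curation_export_published expects, i.e.
# containing curation-export.ttl and a datasets/ folder of per-dataset *.ttl
# files. Both paths below are hypothetical placeholders.
if __name__ == '__main__':
    curation_export_published(
        '~/.local/share/sparcur/export',            # hypothetical export tree
        out_base='/tmp/curation-export-published')  # hypothetical output dir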
def setUp(self):
    Path._cache_class = BlackfynnCache
    self.project_path = Path(project_path)  # FIXME common overwrites?
    self.anchor = self.project_path.cache
    BlackfynnCache.setup(Path, BlackfynnRemoteFactory)
    self.anchor.remote  # trigger creation of _remote_class
    self.BlackfynnRemote = BlackfynnCache._remote_class
def __new__(cls, cache_anchor, local_class, host):
    # TODO decouple _new from init here as well
    session = pxssh.pxssh(options=dict(
        IdentityAgent=os.environ.get('SSH_AUTH_SOCK')))
    session.login(host, ssh_config=Path('~/.ssh/config').expanduser().as_posix())
    cls._rows = 200
    cls._cols = 200
    session.setwinsize(cls._rows, cls._cols)  # prevent linewraps of long commands
    session.prompt()
    atexit.register(lambda: (session.sendeof(), session.close()))
    cache_class = cache_anchor.__class__
    newcls = super().__new__(cls, local_class, cache_class,
                             host=host,
                             session=session)
    newcls._uid, *newcls._gids = [
        int(i) for i in
        (newcls._ssh('echo $(id -u) $(id -G)').decode().split(' '))]

    newcls._cache_anchor = cache_anchor
    # must run before we can get the sysid, which is a bit odd
    # given that we don't actually sandbox the filesystem
    newcls._bind_sysid()

    return newcls
def __init__(self, path):
    self._errors = []
    if isinstance(path, str):
        path = Path(path)

    if not hasattr(self, 'path'):
        self.path = path
def populate_existing_redis(conn):
    """ Set the initial state for exports from the file system. """
    # we intentionally do not go to network here because that will
    # be done by check_for_updates
    datasets_export_base = Path(options.export_path) / 'datasets'
    uuids = [c.name for c in datasets_export_base.children if c.is_dir()]
    for uuid in uuids:
        dataset_id = 'N:dataset:' + uuid
        try:
            # catch potentially malformed ids
            did = PennsieveId(dataset_id)
        except idlib.exc.MalformedIdentifierError as e:
            log.error(f'strange dir in dataset export: {uuid}\n{e}')
            continue

        # FIXME hardcoded convention
        latest = (datasets_export_base / uuid / 'LATEST' / 'curation-export.json')
        if latest.exists():
            with open(latest, 'rt') as f:
                # we don't bother to use fromJson here because we just
                # need the raw values not the sparcur ir
                blob = json.load(f)

            updated = blob['meta']['timestamp_updated']
            #prov_commit = blob['prov']['commit']  # TODO need to be able to detect software changes and rerun

            sid = 'state-' + dataset_id
            uid = 'updated-' + dataset_id
            fid = 'failed-' + dataset_id
            conn.set(sid, _none)
            conn.set(uid, updated)
            conn.set(fid, '')

    log.info(pprint.pformat({k: conn.get(k) for k in sorted(conn.keys())
                             if b'N:dataset' in k},
                            width=120))
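
# Minimal usage sketch (not from the original module): assumes a redis server
# on localhost and that options.export_path already points at an export tree.
# redis.Redis is the standard redis-py client constructor.
import redis

conn = redis.Redis(host='localhost', port=6379)
populate_existing_redis(conn)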
def run_reasoner(self):
    graph = self._mis_graph()
    expanded_graph = self._mis_graph()
    [(graph.add(t), expanded_graph.add(t)) for t in self.triples()]
    closure = rdfc.OWLRL_Semantics
    rdfc.DeductiveClosure(closure).expand(expanded_graph)
    with open(Path(config.cache_dir, 'reasoned-curation-export.ttl'), 'wb') as f:
        f.write(expanded_graph.serialize(format='nifttl'))
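
# Standalone sketch of the reasoning step above, assuming rdfc is the owlrl
# package imported elsewhere in this module. It shows the generic owlrl API on
# a plain rdflib graph, serialized as ordinary turtle rather than the nifttl
# plugin used by run_reasoner.
import rdflib
import owlrl

g = rdflib.Graph()
g.parse(data='<http://example.org/a> a <http://example.org/B> .',
        format='turtle')
owlrl.DeductiveClosure(owlrl.OWLRL_Semantics).expand(g)
print(g.serialize(format='turtle'))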
def __init__(self,
             export_path,
             export_source_path,
             folder_timestamp,
             timestamp,
             latest=False,
             partial=False,
             open_when_done=False,
             org_id=None,
             export_protcur_base=None,
             export_base=None,):
    if org_id is None:
        self.export_source_path = export_source_path
        id = export_source_path.cache.anchor.identifier.uuid
    else:
        # do not set export_source_path, to prevent accidental export
        id = BlackfynnId(org_id).uuid

    self.export_path = Path(export_path)
    self.export_base = (export_base
                        if export_base is not None else
                        Path(export_path, id, self.export_type))
    self.latest = latest
    self.partial = partial
    self.folder_timestamp = folder_timestamp
    self.timestamp = timestamp
    self.open_when_done = open_when_done
    self.export_protcur_base = export_protcur_base  # pass in as export_base
    self._args = dict(
        export_path=export_path,
        export_source_path=export_source_path,
        folder_timestamp=folder_timestamp,
        timestamp=timestamp,
        latest=latest,
        partial=partial,
        open_when_done=open_when_done,
        org_id=org_id,
        export_protcur_base=export_protcur_base,
        export_base=export_base,
    )
def _mis_graph(self):
    """ for now easier to just get a fresh one, they are small """
    glb = pauth.get_path('git-local-base')
    olr = Path(glb / 'duplicates' / 'sparc-NIF-Ontology')
    graph = (rdflib.ConjunctiveGraph()
             .parse((olr / 'ttl/sparc-methods.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods-core.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods-helper.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods.ttl').as_posix(), format='turtle')
             )
    return graph
def _mis_graph(self):
    """ for now easier to just get a fresh one, they are small """
    olr = Path(devconfig.git_local_base) / 'duplicates' / 'sparc-NIF-Ontology'
    graph = (rdflib.ConjunctiveGraph()
             .parse((olr / 'ttl/sparc-methods.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods-core.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods-helper.ttl').as_posix(), format='turtle')
             #.parse((olr / 'ttl/methods.ttl').as_posix(), format='turtle')
             )
    return graph
def setUp(self):
    super().setUp(init_cache=False)
    hostname = gethostname()
    SshCache._local_class = Path
    Path.setup(SshCache, SshRemoteFactory)  # if this doesn't break something I will be surprised

    project_path = Path(self.test_path)
    self.project_path = project_path
    remote_root = PurePath(Path(__file__).parent)  # the 'remote' target
    remote_id = remote_root.as_posix()
    anchor = project_path.cache_init(remote_id, anchor=True)  # this_folder.meta is sort of one extra level of host keys
    # FIXME remote_root doesn't actually work for ssh remotes, it is always '/'
    #anchor = project_path.cache_init('/')  # this_folder.meta is sort of one extra level of host keys
    try:
        self.SshRemote = SshRemoteFactory(anchor, Path, hostname)
    except TypeError:  # pxssh fail
        self.SshRemote = SshRemoteFactory(anchor, Path, hostname + '-local')

    self.this_file = Path(__file__)
    self.this_file_darkly = self.SshRemote(__file__)
    tfd_cache = self.this_file_darkly.cache_init()
def test_meta(self):
    #hrm = this_file_darkly.meta.__dict__, this_file_darkly.local.meta.__dict__
    #assert hrm[0] == hrm[1]

    rm = self.this_file_darkly.meta
    lm = Path(__file__).meta
    rmnid = {k: v for k, v in rm.items() if k != 'id'}
    lmnid = {k: v for k, v in lm.items() if k != 'id'}
    bads = []
    for k, rv in rmnid.items():
        lv = lmnid[k]
        if rv != lv:
            bads.append((lv, rv))

    assert not bads, bads
def __init__(self,
             export_path,
             export_source_path,
             folder_timestamp,
             timestamp,
             latest=False,
             partial=False,
             open_when_done=False,):
    self.export_source_path = export_source_path
    self.export_base = Path(export_path, export_source_path.cache.anchor.id)
    self.latest = latest
    self.partial = partial
    self.folder_timestamp = folder_timestamp
    self.timestamp = timestamp
    self.open_when_done = open_when_done
def __init__(self,
             export_path,
             export_source_path,
             folder_timestamp,
             timestamp,
             latest=False,
             partial=False,
             open_when_done=False,
             org_id=None):
    if org_id is None:
        self.export_source_path = export_source_path
        id = export_source_path.cache.anchor.id
    else:
        # do not set export_source_path, to prevent accidental export
        id = org_id

    self.export_base = Path(export_path, id, self.export_type)
    self.latest = latest
    self.partial = partial
    self.folder_timestamp = folder_timestamp
    self.timestamp = timestamp
    self.open_when_done = open_when_done
        self.queued = False
        self.fetching = False
        self.exporting = False
        self.last_export_failed = None


defaults = {o.name: o.value if o.argcount else None
            for o in parse_defaults(clidoc)}
args = {**defaults,
        'export': True,
        '--jobs': 1,
        'schemas': False,
        'protcur': False,
        '--no-network': True,  # XXX FIXME we need a way to fetch the data once and then reuse
        '--i-know-what-i-am-doing': True,
        'report': False,
        'protocols': False,}  # FIXME separate args for protcur export
options = Options(args, defaults)

project_id = auth.get('remote-organization')
path_source_dir = Path('~/files/sparc-datasets-test').expanduser().resolve()  # FIXME hardcoded XXX resolve required to avoid mismatches
if not path_source_dir.exists():
    path_source_dir.mkdir(parents=True)

cel = Celery('sparcur-cron',)

cel.conf.worker_hijack_root_logger = False
cel.conf.worker_prefetch_multiplier = 1

log.info(f'STATUS sparcur :id {project_id} :path {path_source_dir}')

# FIXME needed a dedicated worker for the cron queue
cel.conf.task_queues = (
    Queue('cron', Exchange('cron'), routing_key='task.cron',
          #max_priority=100,
          queue_arguments={'x-max-priority': 10},
def write_graphs(sgs, path=None):
    if path is None:
        path = Path(tempfile.tempdir) / 'protcur-individual'

    if not path.exists():
        path.mkdir()

    pp = path / 'published'
    if not pp.exists():
        pp.mkdir()

    hpath = path / 'html'
    if not hpath.exists():
        hpath.mkdir()

    hpp = hpath / 'published'
    if not hpp.exists():
        hpp.mkdir()

    opath = path / 'org'
    if not opath.exists():
        opath.mkdir()

    opp = opath / 'published'
    if not opp.exists():
        opp.mkdir()

    for wg in sgs:
        u = next(wg[:rdf.type:sparc.Protocol])
        published = bool(list(wg[u:TEMP.datasetPublishedDoi:]))
        try:
            pid = idlib.Pio(u)
            base = 'pio-' + pid.identifier.suffix
        except idlib.exc.IdlibError as e:
            pid = None
            base = (u
                    .replace('http://', '')
                    .replace('https://', '')
                    .replace('/', '_')
                    .replace('.', '_'))

        name = base + '.ttl'
        hname = base + '.html'
        oname = base + '.org'

        if published:
            wt_path = pp / name
            wh_path = hpp / hname
            wo_path = opp / oname
        else:
            wt_path = path / name
            wh_path = hpath / hname
            wo_path = opath / oname

        wg.write(wt_path)
        write_html(wg, wh_path)

        if pid is None:
            org = None
        else:
            #if wo_path.exists(): continue  # XXX remove after testing complete
            try:
                org = pid.asOrg()
            except idlib.exc.IdlibError as e:
                org = None

        if org is not None:
            with open(wo_path, 'wt') as f:
                f.write(org)
import shutil
from pathlib import PurePosixPath
from datetime import datetime
from sparcur import config
from sparcur import exceptions as exc
from sparcur.paths import Path
from sparcur.paths import LocalPath, PrimaryCache, RemotePath
from sparcur.paths import XattrCache, SymlinkCache
from sparcur.state import State
from sparcur.pathmeta import PathMeta
from sparcur.datasets import Version1Header
from sparcur.curation import PathData, Integrator
from sparcur.blackfynn_api import FakeBFLocal
this_file = Path(__file__)
template_root = this_file.parent.parent / 'resources/DatasetTemplate'
print(template_root)
project_path = this_file.parent / 'test_local/test_project'

test_organization = 'N:organization:ba06d66e-9b03-4e3d-95a8-649c30682d2d'
test_dataset = 'N:dataset:5d167ba6-b918-4f21-b23d-cdb124780da1'

PathData.project_path = project_path

osk = Version1Header.skip_cols  # save original skips
Version1Header.skip_cols = tuple(_ for _ in osk if _ != 'example')  # use the example values for tests

ds_folders = 'ds1', 'ds2', 'ds3', 'ds4'
ds_roots = (
    'ds1',
    'ds2/ds2',
    'ds3/oops',
import os
import shutil
from pathlib import PurePosixPath
from datetime import datetime
from augpathlib import PathMeta
from augpathlib.utils import onerror_windows_readwrite_remove
from sparcur import config
from sparcur import exceptions as exc
from sparcur.paths import Path
from sparcur.paths import LocalPath, PrimaryCache, RemotePath
from sparcur.paths import SymlinkCache
from sparcur.state import State
from sparcur.datasets import Version1Header
from sparcur.curation import PathData, Integrator
from sparcur.blackfynn_api import FakeBFLocal
this_file = Path(__file__)
template_root = this_file.parent.parent / 'resources/DatasetTemplate'
print(template_root)
project_path = this_file.parent / 'test_local/test_project'
fake_organization = 'N:organization:fake-organization-id'
project_path_real = this_file.parent / 'test_local/UCSD'
test_organization = 'N:organization:ba06d66e-9b03-4e3d-95a8-649c30682d2d'
test_dataset = 'N:dataset:5d167ba6-b918-4f21-b23d-cdb124780da1'

onerror = onerror_windows_readwrite_remove if os.name == 'nt' else None

osk = Version1Header.skip_cols  # save original skips
Version1Header.skip_cols = tuple(_ for _ in osk if _ != 'example')  # use the example values for tests

ds_roots = (
    'ds1',
import os  # needed for os.getpid, os.environ, os.name below
from tempfile import gettempdir
from pathlib import PurePosixPath
from datetime import datetime

import pytest

from augpathlib import PathMeta
from augpathlib.utils import onerror_windows_readwrite_remove
from sparcur import config
from sparcur import exceptions as exc
from sparcur.paths import Path
from sparcur.paths import LocalPath, PrimaryCache
from sparcur.paths import SymlinkCache
from sparcur.state import State
from sparcur.datasets import DatasetDescriptionFile
from sparcur.curation import PathData, Integrator
from sparcur.blackfynn_api import FakeBFLocal
this_file = Path(__file__).resolve()  # ARGH PYTHON ARGH NO LOL BAD PYTHON
examples_root = this_file.parent / 'examples'
template_root = this_file.parent.parent / 'resources/DatasetTemplate'
print(template_root)
_pid = os.getpid()
path_project_container = this_file.parent / f'test_local-{_pid}'
project_path = path_project_container / 'test_project'
fake_organization = 'N:organization:fake-organization-id'
project_path_real = path_project_container / 'UCSD'
test_organization = 'N:organization:ba06d66e-9b03-4e3d-95a8-649c30682d2d'
test_dataset = 'N:dataset:5d167ba6-b918-4f21-b23d-cdb124780da1'
temp_path = Path(gettempdir(), f'.sparcur-testing-base-{_pid}')

onerror = onerror_windows_readwrite_remove if os.name == 'nt' else None

SKIP_NETWORK = ('SKIP_NETWORK' in os.environ or
                'FEATURES' in os.environ
def title(self):
    path = Path(self.path)
    return f'{path.name} {path.cache.dataset.name[:30]} ...'
class OrganData:
    """ retrieve SPARC investigator data """

    url = ('https://commonfund.nih.gov/sites/default/'
           'files/sparc_nervous_system_graphic/main.html')

    def organ(self, award_number):
        if award_number in self.manual and award_number not in self.sourced:
            log.warning(f'used manual organ mapping for {award_number}')
        try:
            return self.award_to_organ[award_number]
        except KeyError as e:
            logd.error(f'bad award_number {award_number}')

    __call__ = organ

    organ_lookup = {
        'bladder': OntId('FMA:15900'),
        'brain': OntId('UBERON:0000955'),
        #'computer': OntId(''),
        'heart': OntId('FMA:7088'),
        'kidneys': OntId('FMA:7203'),
        'largeintestine': OntId('FMA:7201'),
        'liver': OntId('FMA:7197'),
        'lung': OntId('FMA:7195'),
        'malerepro': OntId('UBERON:0000079'),
        #'othertargets': OntId(''),
        'pancreas': OntId('FMA:7198'),
        'smallintestine': OntId('FMA:7200'),
        'spleen': OntId('FMA:7196'),
        'stomach': OntId('FMA:7148'),
        'vagus nerve': OntId('FMA:5731'),
        #'uterus': OntId('')
        '': None,
    }

    cache = Path(config.cache_dir, 'sparc-award-by-organ.json')
    old_cache = Path(config.cache_dir, 'award-mappings-old-to-new.json')

    def __init__(self, path=config.organ_html_path, organs_sheet=None):  # FIXME bad passing in organs
        self.path = path
        if not self.cache.exists():
            self.overview()
            with open(self.cache, 'wt') as f:
                json.dump(self.normalized, f)

            with open(self.old_cache, 'wt') as f:
                json.dump(self.former_to_current, f)
        else:
            with open(self.cache, 'rt') as f:
                self.normalized = json.load(f)

            with open(self.old_cache, 'rt') as f:
                self.former_to_current = json.load(f)

        if organs_sheet is not None:
            self._org = organs_sheet
            bc = self._org.byCol
            self.manual = {
                award if award else (award_manual if award_manual else None):
                [OntId(t) for t in organ_term.split(' ') if t]
                for award, award_manual, organ_term in zip(
                    bc.award, bc.award_manual, bc.organ_term)
                if organ_term}
        else:
            self.manual = {}

        self.sourced = {v: k for k, vs in self.normalized.items() for v in vs}
        self.award_to_organ = {**self.sourced, **self.manual}  # manual override

    def overview(self):
        if self.path.exists():
            with open(self.path, 'rb') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
        else:
            resp = requests.get(self.url)
            soup = BeautifulSoup(resp.content, 'lxml')

        self.raw = {}
        self.former_to_current = {}
        for bsoup in soup.find_all('div', {'id': lambda v: v and v.endswith('-bubble')}):
            organ, _ = bsoup['id'].split('-')
            award_list = self.raw[organ] = []
            for asoup in bsoup.find_all('a'):
                href = asoup['href']
                log.debug(href)
                parts = urlparse(href)
                query = parse_qs(parts.query)
                if 'projectnumber' in query:
                    award_list.extend(query['projectnumber'])
                elif 'aid' in query:
                    #aid = [int(a) for a in query['aid']]
                    #json = self.reporter(aid)
                    award, former = self.reporter(href)
                    award_list.append(award)
                    if former is not None:
                        award_list.append(former)  # for this usecase this is ok
                        self.former_to_current[former] = award
                elif query:
                    log.debug(lj(query))

        self.former_to_current = {
            nml.NormAward(nml.NormAward(k)): nml.NormAward(nml.NormAward(v))
            for k, v in self.former_to_current.items()}
        self._normalized = {}
        self.normalized = {}
        for frm, to in ((self.raw, self._normalized),
                        (self._normalized, self.normalized)):
            for organ, awards in frm.items():
                if organ in self.organ_lookup:
                    organ = self.organ_lookup[organ].iri

                to[organ] = [nml.NormAward(a) for a in awards]

    def _reporter(self, aids):
        # can't seem to get this to cooperate
        base = ('https://api.federalreporter.nih.gov'
                '/v1/projects/FetchBySmApplIds')
        resp = requests.post(base, json=aids,
                             headers={'Accept': 'application/json',
                                      'Content-Type': 'application/json'})
        breakpoint()
        return resp.json()

    def reporter(self, href):
        resp = requests.get(href)
        soup = BeautifulSoup(resp.content, 'lxml')
        #id = soup.find_all('span', {'id': 'spnPNUMB'})
        table = soup.find_all('table', {'summary': 'Details'})
        text = table[0].find_all('td')[1].text.strip()
        if 'Former' in text:
            award, rest = text.split(' ', 1)
            rest, former = text.rsplit(' ', 1)
            return [award, former]
        else:
            return [text, None]
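
# Illustrative lookup sketch (not from the original module): OrganData builds
# award -> organ mappings from the cached NIH SPARC page, then acts as a
# callable from award number to organ identifier. The award number below is a
# placeholder, not a real award.
organ_data = OrganData()
organ = organ_data('OT2ODXXXXXXX')  # hypothetical award; returns an organ identifier or None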
class ProtocolData(dat.HasErrors):
    # this class is best used as a helper class not as a __call__ class

    def __init__(self, id=None):  # FIXME lots of ways to use this class ...
        self.id = id  # still needed for the converters use case :/
        super().__init__(pipeline_stage=self.__class__)

    def protocol(self, uri):
        return self._get_protocol_json(uri)

    __call__ = protocol

    @classmethod
    def setup(cls, creds_file=None):
        if creds_file is None:
            try:
                creds_file = devconfig.secrets('protocols-io', 'api', 'creds-file')
            except KeyError as e:
                raise TypeError('creds_file is a required argument'
                                ' unless you have it in secrets') from e

        _pio_creds = get_protocols_io_auth(creds_file)
        cls._pio_header = QuietDict(
            {'Authorization': 'Bearer ' + _pio_creds.access_token})

    @classmethod
    def cache_path(cls):
        return config.protocol_cache_path

    @property
    def protocol_uris_resolved(self):
        if not hasattr(self, '_c_protocol_uris_resolved'):
            self._c_protocol_uris_resolved = list(self._protocol_uris_resolved)

        return self._c_protocol_uris_resolved

    @property
    def _protocol_uris_resolved(self):
        # FIXME quite slow ...
        for start_uri in self.protocol_uris:
            log.debug(start_uri)
            for end_uri in resolution_chain(start_uri):
                pass
            else:
                yield end_uri

    @property
    def protocol_annotations(self):
        for uri in self.protocol_uris_resolved:
            yield from protc.byIri(uri, prefix=True)

    @property
    def protocol_jsons(self):
        for uri in self.protocol_uris_resolved:
            yield self._get_protocol_json(uri)

    @cache(Path(config.cache_dir, 'protocol_json'))
    def _get_protocol_json(self, uri):
        #juri = uri + '.json'
        logd.info(uri)
        pi = get_right_id(uri)
        if 'protocols.io' in pi:
            pioid = pi.slug  # FIXME normalize before we ever get here ...
            log.info(pioid)
        else:
            msg = f'protocol uri is not from protocols.io {pi} {self.id}'
            logd.error(msg)
            self.addError(msg)
            return

        #uri_path = uri.rsplit('/', 1)[-1]
        apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
        #'https://www.protocols.io/api/v3/groups/sparc/protocols'
        #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
        #print(apiuri, header)
        log.debug('going to network for protocols')
        resp = requests.get(apiuri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                j = resp.json()  # the api is reasonably consistent
            except BaseException as e:
                log.exception(e)
                breakpoint()
                raise e

            return j
        else:
            try:
                j = resp.json()
                sc = j['status_code']
                em = j['error_message']
                msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
                logd.error(msg)
                self.addError(msg)
                # can't return here because of the cache
            except BaseException as e:
                log.exception(e)

            logd.error(f'protocol no access {uri} {self.id!r}')
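
# Illustrative usage sketch (not from the original module): setup() loads the
# protocols.io API token (from secrets unless creds_file is given), after which
# an instance can be called with a protocol uri to fetch its json. The id and
# uri below are placeholders, not real identifiers.
ProtocolData.setup()
pd = ProtocolData(id='N:dataset:placeholder')
blob = pd('https://www.protocols.io/view/placeholder-protocol')  # hypothetical uri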
def setUp(self):
    self.ds = [dat.DatasetStructureLax(p)
               for p in Path(project_path).children]
        return self._data_cache

    @hasSchema.f(sc.SummarySchema, fail=True)
    def data(self, timestamp=None):
        data = self._pipeline_end(timestamp)
        return data  # FIXME we want objects that wrap the output rather than generate it ...

    @hasSchema.f(sc.SummarySchema, fail=True)
    def data_for_export(self, timestamp):
        data = self._pipeline_end(timestamp)
        # NOTE this timestamps the cached data AS INTENDED
        data['prov']['timestamp_export_start'] = timestamp
        return data


_p = Path(tempfile.gettempdir()) / 'asdf'
_p.mkdir(exist_ok=True)  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXX


def datame(d, ca, timestamp, helpers=None,
           log_level=logging.INFO,
           dp=_p, evil=[False], dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth',
                 'ontquery', 'augpathlib', 'pyontutils')
    for log_name in log_names:
import os  # needed for os.getpid and os.name below
import shutil
from tempfile import gettempdir
from pathlib import PurePosixPath
from datetime import datetime
from augpathlib import PathMeta
from augpathlib.utils import onerror_windows_readwrite_remove
from sparcur import config
from sparcur import exceptions as exc
from sparcur.paths import Path
from sparcur.paths import LocalPath, PrimaryCache, RemotePath
from sparcur.paths import SymlinkCache
from sparcur.state import State
from sparcur.datasets import DatasetDescriptionFile
from sparcur.curation import PathData, Integrator
from sparcur.blackfynn_api import FakeBFLocal
this_file = Path(__file__)
examples_root = this_file.parent / 'examples'
template_root = this_file.parent.parent / 'resources/DatasetTemplate'
print(template_root)
project_path = this_file.parent / 'test_local/test_project'
fake_organization = 'N:organization:fake-organization-id'
project_path_real = this_file.parent / 'test_local/UCSD'
test_organization = 'N:organization:ba06d66e-9b03-4e3d-95a8-649c30682d2d'
test_dataset = 'N:dataset:5d167ba6-b918-4f21-b23d-cdb124780da1'
temp_path = Path(gettempdir(), f'.sparcur-testing-base-{os.getpid()}')
onerror = onerror_windows_readwrite_remove if os.name == 'nt' else None

ddih = DatasetDescriptionFile.ignore_header  # save original skips
DatasetDescriptionFile.ignore_header = tuple(_ for _ in ddih if _ != 'example')  # use the example values for tests