Example #1
    def _ontology_local_repo(self):
        try:
            stated_repo = Path(self.config['ontology_local_repo'])
        except (KeyError, TypeError, FileNotFoundError):
            stated_repo = Path('/dev/null/does-not-exist')

        maybe_repo = self._maybe_repo
        if stated_repo.exists():
            return stated_repo
        elif maybe_repo.exists():
            return maybe_repo
        else:
            maybe_start = Path(__file__).parent.parent.parent.absolute()
            maybe_base = maybe_start
            fsroot = Path('/')
            while maybe_base != fsroot:
                maybe_repo = maybe_base / self.ontology_repo
                if maybe_repo.exists():
                    log.info(
                        tc.blue('INFO:') +
                        f' Ontology repository found at {maybe_repo}')
                    return maybe_repo
                else:
                    maybe_base = maybe_base.parent
            else:
                log.warning(
                    tc.red('WARNING:') +
                    f' No repository found in any parent directory of {maybe_start}'
                )

        return Path('/dev/null/does-not-exist')  # seems reasonable ...
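A minimal, self-contained sketch of the same parent-directory walk, stripped of the config handling (the name find_in_parents and the 'some-repo' target are illustrative, not part of the original API):

    from pathlib import Path

    def find_in_parents(start, target):
        """Walk from start up to the filesystem root looking for target."""
        base = Path(start).absolute()
        fsroot = Path(base.anchor)
        while base != fsroot:
            candidate = base / target
            if candidate.exists():
                return candidate
            base = base.parent
        return None  # no parent directory contains target

    # e.g. find_in_parents(__file__, 'some-repo')
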
Example #2
    def auth(self):
        newline = '\n'
        scopes = list(self._scopes)
        if self.options.debug:
            log.debug(f'requesting scopes:\n{newline.join(scopes)}')

        service = _get_oauth_service(readonly=self.options.readonly, SCOPES=scopes)
        # FIXME decouple this ...
        log.info(f'Auth finished successfully for scopes:\n{newline.join(scopes)}')
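The newline temporary exists because f-string expressions could not contain backslashes before Python 3.12 (PEP 701). A small illustration with made-up scope URLs:

    scopes = ['https://example.org/auth/drive.readonly',
              'https://example.org/auth/spreadsheets']
    newline = '\n'
    print(f'requesting scopes:\n{newline.join(scopes)}')
    # on Python >= 3.12 the temporary is unnecessary:
    # print(f'requesting scopes:\n{"\n".join(scopes)}')
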
Example #3
def make_triple(id_, field, value, column_to_predicate=_column_to_predicate):
    if field == 'id':
        if value.startswith('SCR:'):
            value = owl.NamedIndividual
        else:
            log.info(value)
            value = owl.Class
    #if type(value) == bool:
    #    if value:
    #        value = rdflib.Literal(True)
    #    else:
    #        value = rdflib.Literal(False)
    return id_, column_to_predicate[field], value
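Usage sketch, with a hypothetical _column_to_predicate mapping standing in for the real one (only the 'id' behavior shown above is taken from the code):

    import rdflib
    from rdflib.namespace import OWL as owl, RDF

    _column_to_predicate = {'id': RDF.type}  # hypothetical mapping

    subj = rdflib.URIRef('http://example.org/SCR_000001')
    triple = make_triple(subj, 'id', 'SCR:000001', _column_to_predicate)
    # -> (subj, rdf:type, owl:NamedIndividual) because the value starts with 'SCR:'
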
Example #4
def bootstrap_config():
    if not devconfig.config_file.exists():
        # scigraph api
        maybe_key = get_api_key()
        if maybe_key:
            from pyontutils.scigraph_client import BASEPATH
            devconfig.scigraph_api = BASEPATH
        else:
            devconfig.scigraph_api = devconfig.scigraph_api.default

        # ontology repo
        p1 = Path(__file__).resolve().parent.parent.parent
        p2 = Path(devconfig.git_local_base).resolve().absolute()
    log.debug(f'{p1} {p2}')
        if (p1 / devconfig.ontology_repo).exists():
            if p1 != p2:
                devconfig.git_local_base = p1
    else:
        log.info(f'config already exists at {devconfig.config_file}')
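The shape of the bootstrap is: write defaults only when no config exists yet. A stripped-down sketch (the path and keys are made up):

    import json
    from pathlib import Path

    def bootstrap_config(path=Path('~/.config/demo/config.json').expanduser()):
        """Write a default config, but never clobber an existing one."""
        if path.exists():
            print(f'config already exists at {path}')
            return
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps({'scigraph_api': None,
                                    'git_local_base': str(Path.home())},
                                   indent=2))
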
Example #5
    def inner(local_filepath, remote=False):
        if noneMembers(local_filepath, *bigleaves) or dobig:
            ext = os.path.splitext(local_filepath)[-1]
            if ext == '.ttl':
                infmt = 'turtle'
            else:
                log.info((ext, local_filepath))
                infmt = None
            if remote:
                # TODO nonblocking: pull these out, fetch, run inner again until done
                resp = requests.get(local_filepath)
                raw = resp.text.encode()
            else:
                try:
                    with open(local_filepath, 'rb') as f:
                        raw = f.read()
                except FileNotFoundError as e:
                    if local_filepath.startswith('file://'):
                        log.info(
                            f'local_imports has already been run, skipping {local_filepath}'
                        )
                        return
                        #raise ValueError('local_imports has already been run') from e
                    else:
                        # TODO raise a warning if the file cannot be matched
                        log.exception(e)
                        # seems like good practice to have any imported ontology under
                        # version control so all imports are guaranteed to have good
                        # provenance and not split the prior information between the
                        # scigraph config and the repository; the repository remains
                        # the source of truth, load.yaml files can then pick a subset
                        # of the properly tracked files to load as they see fit, but
                        # not add to them (at least in pyontutils land)
                        raw = b''

            if oo in raw:  # we only care if there are imports or an ontology iri
                scratch = OntGraph()
                if infmt == 'turtle':
                    data, rest = raw.split(b'###', 1)
                elif infmt is None:  # assume xml
                    xml_tree = etree.parse(BytesIO(raw))
                    xml_root = xml_tree.getroot()
                    xml_ontology = xml_tree.xpath(
                        "/*[local-name()='RDF']/*[local-name()='Ontology']")
                    xml_root.clear()
                    xml_root.append(xml_ontology[0])
                    data = etree.tostring(xml_root)
                scratch.parse(data=data, format=infmt)
                for s in scratch.subjects(rdf.type, owl.Ontology):
                    triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))
                    # somehow this breaks computing the chain
                    #for p in (rdfs.comment, skos.definition, definition, dc.title, rdfs.label):
                    #    for o in scratch[s:p]:
                    #        triples.add((s, p, o))
                for s, o in sorted(scratch.subject_objects(p)):
                    if revert:
                        raise NotImplementedError('TODO')
                    nlfp = o.replace(remote_base, local_base)
                    triples.add((s, p, o))
                    if 'http://' in local_filepath or 'external' in local_filepath:  # FIXME what to do about https used inconsistently :/
                        if 'external' in local_filepath:
                            imported_iri = rdflib.URIRef(
                                local_filepath.replace(
                                    local_base, remote_base))  # inefficient
                        else:
                            imported_iri = rdflib.URIRef(local_filepath)
                        if s != imported_iri:
                            # kept for the record
                            imported_iri_vs_ontology_iri[imported_iri] = s
                            # bridge imported != ontology iri
                            triples.add((imported_iri, p, s))
                    if local_base in nlfp and 'file://' not in o:  # FIXME file:// should not be slipping through here...
                        scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                        scratch.remove((s, p, o))
                    if nlfp not in done:
                        done.append(nlfp)
                        if local_base in nlfp and 'external' not in nlfp:  # skip externals TODO
                            inner(nlfp)
                        elif readonly:  # read external imports
                            if 'external' in nlfp:
                                inner(nlfp)
                            else:
                                inner(nlfp, remote=True)
                if not readonly:
                    _orp = CustomTurtleSerializer.roundtrip_prefixes  # FIXME awful hack :/
                    CustomTurtleSerializer.roundtrip_prefixes = True
                    ttl = scratch.serialize(format='nifttl', encoding='utf-8')
                    CustomTurtleSerializer.roundtrip_prefixes = _orp
                    ndata, comment = ttl.split(b'###', 1)
                    out = ndata + b'###' + rest
                    with open(local_filepath, 'wb') as f:
                        f.write(out)
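The core rewrite at the heart of inner, isolated: swap remote import IRIs for file:// IRIs when they resolve under the local checkout. This sketch assumes the predicate p bound in the enclosing scope is owl:imports and uses made-up bases:

    import rdflib
    from rdflib.namespace import OWL

    remote_base = 'http://example.org/NIF/ttl'  # hypothetical bases
    local_base = '/repos/ontology/ttl'

    g = rdflib.Graph()
    s = rdflib.URIRef(remote_base + '/a.ttl')
    o = rdflib.URIRef(remote_base + '/b.ttl')
    g.add((s, OWL.imports, o))

    for s_, o_ in list(g.subject_objects(OWL.imports)):
        nlfp = o_.replace(remote_base, local_base)  # URIRef is a str subclass
        if local_base in nlfp and 'file://' not in o_:
            g.add((s_, OWL.imports, rdflib.URIRef('file://' + nlfp)))
            g.remove((s_, OWL.imports, o_))

    print(g.serialize(format='turtle'))
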
Example #6
    def __init__(self,
                 zip_location,
                 git_remote,
                 org,
                 git_local,
                 repo_name,
                 branch,
                 commit,
                 remote_base,
                 load_base,
                 graphload_config_template,
                 graphload_ontologies,
                 patch_config,
                 patch,
                 scigraph_commit,
                 post_clone=lambda: None,
                 fix_imports_only=False,
                 check_built=False):

        date_today = TODAY()

        load_from_repo = True
        local_base = jpth(git_local, repo_name)
        if load_from_repo:
            repo, nob = self._set_up_repo_state(local_base, git_remote, org,
                                                git_local, repo_name, branch,
                                                commit, post_clone)
            ontology_commit = repo.head.object.hexsha[:COMMIT_HASH_HEAD_LEN]
        else:
            ontology_commit = 'NONE'

        config_path, config = self.make_graphload_config(
            graphload_config_template, graphload_ontologies, zip_location,
            date_today)
        config_hash = identity_json(config, sort_lists=True).hex()

        (graph_path, zip_path, zip_command,
         wild_zip_path) = self._set_up_paths(zip_location, repo_name, branch,
                                             scigraph_commit, ontology_commit,
                                             config_hash, date_today)

        # NOTE config is modified in place
        ontologies = self.configure_config(config, graph_path, remote_base,
                                           local_base, config_path)

        load_command = load_base.format(
            config_path=config_path)  # 'exit 1' to test
        log.info(load_command)

        if load_from_repo:
            # replace raw GitHub imports with ontology.neuinfo.org iris to simplify the import chain
            # FIXME this is hardcoded and will not generalize ...
            fix_imports = ("find " + local_base + (
                " -name '*.ttl' -exec sed -i"
                r" 's,<http.\+/ttl/,<http://ontology.neuinfo.org/NIF/ttl/,' {} \;"
            ))
            os.system(fix_imports)

        if load_from_repo and not fix_imports_only:

            def reset_state(original_branch=nob):
                repo.git.checkout('--', local_base)
                original_branch.checkout()
        else:
            reset_state = lambda: None

        with execute_regardless(
                reset_state
        ):  # FIXME start this immediately after we obtain nob?
            # main
            if load_from_repo:
                if patch:
                    # FIXME TODO XXX does scigraph load from the catalog!??!??
                    # because it seems like doid loads correctly without using local_versions
                    # which would be cool, if confusing
                    local_versions = tuple(do_patch(patch_config, local_base))
                else:
                    local_versions = tuple()
                itrips = local_imports(
                    remote_base,
                    local_base,
                    ontologies,
                    local_versions=local_versions,
                    dobig=True)  # SciGraph doesn't support catalog.xml
                catalog = make_catalog(itrips)
                with open(Path(local_base, 'catalog.xml'), 'wt') as f:
                    f.write(catalog)
            else:
                itrips = []

            maybe_zip_path = glob(wild_zip_path)
            if fix_imports_only:
                pass
            elif not maybe_zip_path:
                if check_built:
                    print('The graph has not been loaded.')
                    raise NotBuiltError('The graph has not been loaded.')

                #breakpoint()
                failure = os.system(load_command)
                if failure:
                    if os.path.exists(graph_path):
                        shutil.rmtree(graph_path)
                else:
                    os.rename(
                        config_path,  # save the config for easier debugging
                        graph_path / config_path.name)
                    cpr = config_path.with_suffix(config_path.suffix + '.raw')
                    os.rename(cpr, graph_path / cpr.name)
                    failure = os.system(zip_command)  # graphload zip
            else:
                zip_path = maybe_zip_path[0]  # this way we get the actual date
                print('Graph already loaded at', zip_path)

            # this needs to be run when the branch is checked out
            # FIXME might be worth adding this to the load config?
            self.ontologies = [
                get_iri(load_header(rec['url']))
                for rec in config['ontologies']
            ]

        self.zip_path = zip_path
        self.itrips = itrips
        self.config = config
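execute_regardless is used here as "always run reset_state, success or failure". A plausible minimal reading of it as a try/finally context manager (the real implementation may differ):

    from contextlib import contextmanager

    @contextmanager
    def execute_regardless(function):
        try:
            yield
        finally:
            function()  # runs whether or not the with-body raised
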
Example #7
def get_records(user=defaults['--user'],
                host=defaults['--host'],
                port=defaults['--port'],
                database=defaults['--database'],
                field_mapping=_field_mapping):
    DB_URI = 'mysql+{driver}://{user}:{password}@{host}:{port}/{db}'
    config = mysql_conn_helper(host, database, user, port)
    try:
        engine = create_engine(DB_URI.format(driver='mysqlconnector',
                                             **config))
    except ModuleNotFoundError:
        engine = create_engine(DB_URI.format(driver='pymysql', **config))
    config = None  # all weakrefs should be gone by now?
    del config  # i wonder whether this actually cleans it up when using **config
    insp = inspect(engine)
    #names = [c['name'] for c in insp.get_columns('registry')]
    #resource_columns = [c['name'] for c in insp.get_columns('resource_columns')]
    #resource_data = [c['name'] for c in insp.get_columns('resource_data')]
    #resource_fields = [c['name'] for c in insp.get_columns('resource_fields')]
    #resources = [c['name'] for c in insp.get_columns('resources')]
    #conn.execute('SELECT * from registry;')
    if 1:  # this if is for indentation purposes only
        #with engine.connect() as conn:
        conn = engine
        tables = ('resource_columns', 'resource_data', 'resource_fields',
                  'resources')
        data = {
            t: ([c['name'] for c in insp.get_columns(t)],
                conn.execute('SELECT * from %s limit 20;' % t).fetchall())
            for t in tables
        }
        all_fields = [
            n[0] for n in conn.execute(
                'SELECT distinct(name) FROM resource_fields;').fetchall()
        ]

        #query = conn.execute('SELECT r.rid, r.original_id, r.type, rc.name, rc.value from resources as r JOIN'
        #' resource_columns as rc ON r.id=rc.rid'
        #' WHERE rc.name IN %s limit 1000;' % str(tuple([n for n in field_mapping if n != 'MULTI'])))  # XXX DANGER THIS QUERY IS O(x^n) :x
        #' ORDER BY r.rid limit 2000;'

        #query = conn.execute('SELECT r.rid, r.original_id, r.type, rc.name, rc.value from resource_columns as rc JOIN'
        #' resources as r ON rc.rid=r.id'
        #' WHERE rc.name IN %s;' % str(tuple([n for n in field_mapping if n != 'MULTI'])))  # XXX DANGER why does > 2000 limit break stuff?

        #join = query.fetchall()

        #print('running join')
        log.info('running 1')
        r_query = conn.execute(
            'SELECT id, rid, original_id, type, status FROM resources WHERE id >= 0;'
        )  # avoid the various test entries :(
        log.info('fetching 1')
        r = r_query.fetchall()
        log.info('running 2')
        rc_query = conn.execute(
            'SELECT rid, name, value, version FROM resource_columns as rc WHERE rc.rid >= 0 AND rc.name IN %s;'
            % str(tuple([n for n in field_mapping if n != 'MULTI'])))
        log.info('fetching 2')
        rc = rc_query.fetchall()

    fixesForResourcesAndColumns(r, rc)
    records = make_records(r, rc, field_mapping)
    log.info('Fetching and data prep done.')
    return records
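The rc query above splices its IN clause together with % string formatting; a sketch of the same query shape using bound parameters instead (SQLite stands in for MySQL here, and the rows are made up):

    from sqlalchemy import bindparam, create_engine, text

    engine = create_engine('sqlite://')
    with engine.connect() as conn:
        conn.execute(text('CREATE TABLE resource_columns (rid INT, name TEXT, value TEXT)'))
        conn.execute(text("INSERT INTO resource_columns VALUES (1, 'Resource Name', 'demo')"))
        stmt = (text('SELECT rid, name, value FROM resource_columns '
                     'WHERE rid >= 0 AND name IN :names')
                .bindparams(bindparam('names', expanding=True)))
        rows = conn.execute(stmt, {'names': ['Resource Name', 'Synonyms']}).fetchall()
        print(rows)  # [(1, 'Resource Name', 'demo')]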