Code example #1
from os import path, walk

import rdflib
from tqdm import tqdm

ttlFolder = "data"  # assumed placeholder; the real folder path is not shown in the snippet
additionalFiles = [
]  # Additional files that should be scanned can be added here

# Look at all Turtle files
inputFiles = [
    path.join(root, name) for root, dirs, files in walk(ttlFolder)
    for name in files if name.endswith(".ttl")
]

# Add additional files
inputFiles.extend(additionalFiles)

# Identify LOC identifiers by looking at triples where the object is an LOC identifier
locIdentifiers = []
for file in tqdm(inputFiles):
    if file.endswith('.trig'):
        g = rdflib.Dataset()
        g.parse(file)
        queryResults = g.query("""SELECT DISTINCT ?loc WHERE {
            GRAPH ?g {
                ?s ?p ?loc .
            }
            FILTER(REGEX(STR(?loc),"http://id.loc.gov/"))
        }""")
    else:
        g = rdflib.Graph()
        g.parse(file)
        queryResults = g.query("""SELECT DISTINCT ?loc WHERE {
            ?s ?p ?loc .
            FILTER(REGEX(STR(?loc),"http://id.loc.gov/"))
        }""")
    for row in queryResults:
        locIdentifiers.append(str(row[0]))  # loop body assumed; the original snippet is truncated here
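A note on the Dataset/Graph split above: rdflib.Dataset keeps TriG named graphs separate, which is why that query needs the GRAPH ?g block, while a Graph parsed from Turtle can be queried with a plain triple pattern. A minimal sketch of the union-style alternative (the input file name is an assumption, not from the script above):

# Minimal sketch: with default_union=True the SPARQL default graph is the union
# of all named graphs, so no GRAPH block is needed.
ds = rdflib.Dataset(default_union=True)
ds.parse("example.trig", format="trig")  # hypothetical input file
unionResults = ds.query("""SELECT DISTINCT ?loc WHERE {
    ?s ?p ?loc .
    FILTER(REGEX(STR(?loc),"http://id.loc.gov/"))
}""")
for row in unionResults:
    print(row[0])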
Code example #2
File: apirdflib.py  Project: rishitakalyani/schemaorg
from testharness import *
from sdoutil import *
import api
from apimarkdown import Markdown
import StringIO
import threading

import rdflib
from rdflib.parser import Parser
from rdflib.serializer import Serializer

rdflib.plugin.register("json-ld", Parser, "rdflib_jsonld.parser",
                       "JsonLDParser")
rdflib.plugin.register("json-ld", Serializer, "rdflib_jsonld.serializer",
                       "JsonLDSerializer")

ATTIC = 'attic'
VOCAB = None
VOCABLEN = 0
ALTVOCAB = "https://schema.org"
STORE = rdflib.Dataset()
#Namespace mapping#############
nss = {'core': 'http://schema.org/'}
revNss = {}
NSSLoaded = False
allLayersList = []

context_data = "data/internal-context"  # Local file containing the context to be used when loading .jsonld files

RDFLIBLOCK = threading.Lock()  #rdflib uses generators which are not threadsafe

from rdflib.namespace import RDFS, RDF, OWL
SCHEMA = rdflib.Namespace('http://schema.org/')

QUERYGRAPH = None
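For context, a minimal sketch of how the json-ld plugin registration and the STORE Dataset above are typically used; the graph identifier and file path are assumptions, not taken from apirdflib.py:

# Hypothetical usage of the globals defined above.
layer_graph = STORE.graph(rdflib.URIRef("http://schema.org/core"))  # named graph inside the Dataset
layer_graph.parse("data/schema.jsonld", format="json-ld")           # assumed file path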
Code example #3
File: exportgraphs.py  Project: lambda2/schemaorg
import rdflib
from rdflib import Literal

store = getMasterStore()
read_schemas(loadExtensions=True)
read_extensions(sdoapp.ENABLED_EXTENSIONS)
graphs = list(store.graphs())

from rdflib.namespace import RDFS


def MdComments(g):  #Process Markdown
    for s, p, o in list(g.triples((None, RDFS.comment, None))):
        no = MD.parse(o)  #g.remove((s,p,o))
        g.set((s, p, Literal(no)))


outGraph = rdflib.Dataset()
simpleFormat = False
if args.format == "xml" or args.format == "nt" or args.format == "turtle":
    simpleFormat = True
    outGraph = rdflib.Graph()

gs = sorted(list(store.graphs()), key=lambda u: u.identifier)

for g in gs:  #Put core first
    if str(g.identifier) == "http://schema.org/":
        gs.remove(g)
        gs.insert(0, g)
        break

for g in gs:
    id = str(g.identifier)
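    # Hypothetical remainder of the loop body (assumed, not taken from exportgraphs.py):
    # run the Markdown pass, then copy this layer into the output graph/dataset.
    MdComments(g)
    target = outGraph if simpleFormat else outGraph.graph(g.identifier)
    for triple in g.triples((None, None, None)):
        target.add(triple)

# Hypothetical final step: emit the merged result in the requested format.
print(outGraph.serialize(format=args.format))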
Code example #4
import csv
import uuid

import rdflib
from docopt import docopt
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib.namespace import RDF, XSD


def main():
    """Main entry point for the dfs CLI."""
    args = docopt(__doc__, version=__version__)
    csvfile = args["FILE"]

    PROV = Namespace("http://www.w3.org/ns/prov#")
    QUDT = Namespace("http://qudt.org/schema/qudt#")
    UNIT = Namespace("http://qudt.org/1.1/vocab/unit#")
    COMPONENT = Namespace("http://crc.nd.edu/schema/component#")

    ds = rdflib.Dataset(default_union=True)
    ds.bind("prov", PROV)
    ds.bind("qudt", QUDT)
    ds.bind("component", COMPONENT)

    with open(csvfile, 'rb') as f:
        reader = csv.reader(f)
        # id[0], namegbxml[1], namearch[2], iswindow[3],
        # thickness[4], embodiedenergy[5], eeunit_id[6],
        # matdensityarch[7], matdensitygbxml[8], densityunit_id[9],
        # unitcostmat[10], unitcostmle[11], unitcostttl[12],
        # financialunit_id[13], lifeexpectancy[14], maintenancefactor[15]
        # infosource[16], confidence[17]
        for row in reader:
            # generate new uuid for component
            componentid = 'urn:green-matdb:' + str(uuid.uuid4())
            ds.add((URIRef(componentid), RDF.type, COMPONENT.Component))
            ds.add((URIRef(componentid), COMPONENT.gbxmlname, Literal(row[1])))
            ds.add((URIRef(componentid), COMPONENT.archname, Literal(row[2])))
            # Check to see if this guy is a window
            if (row[3] == '1'):
                ds.add((URIRef(componentid), RDF.type, COMPONENT.Window))
            # Check to see if we have a thickness
            # thicknessid = 'urn:green-matdb:' + str(uuid.uuid4())
            thicknessid = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasThickness, thicknessid))
            ds.add((thicknessid, RDF.type, QUDT.QuantityValue))
            ds.add((thicknessid, QUDT.numericValue,
                    Literal(row[4], datatype=XSD.float)))
            ds.add((thicknessid, QUDT.unit, UNIT.Inch))
            embodiedenergy = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasEmbodiedEnergy,
                    embodiedenergy))
            ds.add((embodiedenergy, RDF.type, QUDT.QuantityValue))
            ds.add((embodiedenergy, QUDT.numericValue,
                    Literal(row[5], datatype=XSD.float)))
            if (row[6] == '1'):
                ds.add((embodiedenergy, QUDT.unit, UNIT.BtuPerPound))
            elif (row[6] == '2'):
                # This QUDT unit doesn't exist. Unit is JoulePerKilogram.
                # Need to create new derived unit.
                ds.add((embodiedenergy, QUDT.unit, UNIT.MegaJoulePerKilogram))
            materialdensityArch = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasMaterialDensity,
                    materialdensityArch))
            ds.add((materialdensityArch, COMPONENT.hasSource,
                    COMPONENT.Archsource))
            ds.add((materialdensityArch, RDF.type, QUDT.QuantityValue))
            ds.add((materialdensityArch, QUDT.numericValue,
                    Literal(row[7], datatype=XSD.float)))
            if (row[9] == '1'):
                ds.add((materialdensityArch, QUDT.unit,
                        UNIT.KilogramPerCubicMeter))
            elif (row[9] == '2'):
                ds.add(
                    (materialdensityArch, QUDT.unit, UNIT.PoundPerCubicFoot))
            materialdensitygbxml = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasMaterialDensity,
                    materialdensitygbxml))
            ds.add((materialdensityArch, COMPONENT.hasSource,
                    COMPONENT.gbxmlsource))
            ds.add((materialdensitygbxml, RDF.type, QUDT.QuantityValue))
            ds.add((materialdensitygbxml, QUDT.numericValue,
                    Literal(row[8], datatype=XSD.float)))
            if (row[9] == '1'):
                ds.add((materialdensitygbxml, QUDT.unit,
                        UNIT.KilogramPerCubicMeter))
            elif (row[9] == '2'):
                ds.add(
                    (materialdensitygbxml, QUDT.unit, UNIT.PoundPerCubicFoot))
            unitcostmat = BNode()
            ds.add((URIRef(componentid), COMPONENT.hasUnitCost, unitcostmat))
            ds.add((unitcostmat, RDF.type, QUDT.QuantityValue))
            ds.add((unitcostmat, QUDT.numericValue,
                    Literal(row[10], datatype=XSD.float)))
            unitcostMLE = BNode()

    print(ds.serialize(format="turtle"))
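A note on the default_union=True flag used when the Dataset is constructed above: it makes the default graph the union of all named graphs, so triples are visible to queries without a GRAPH block. A small self-contained sketch (the URIs are placeholders):

# Minimal sketch illustrating default_union=True.
import rdflib
from rdflib import URIRef

ds2 = rdflib.Dataset(default_union=True)
g2 = ds2.graph(URIRef("urn:example:layer"))
g2.add((URIRef("urn:example:s"), URIRef("urn:example:p"), URIRef("urn:example:o")))
result = ds2.query("ASK { <urn:example:s> <urn:example:p> <urn:example:o> }")
print(result.askAnswer)  # True: the named-graph triple is visible in the default (union) graph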
Code example #5
File: clone.py  Project: RDFLib/pySHACL
def mix_datasets(base_ds: ConjunctiveLike,
                 extra_ds: GraphLike,
                 target_ds: Optional[Union[ConjunctiveLike, str]] = None):
    """
    Make a clone of base_ds (dataset) and add in the triples from extra_ds (dataset)
    :param base_ds:
    :type base_ds: rdflib.Dataset
    :param extra_ds:
    :type extra_ds: rdflib.Dataset
    :param target_ds:
    :type target_ds: rdflib.Dataset|str|NoneType
    :return: The cloned Dataset with mixed in triples from extra_ds
    :rtype: rdflib.Dataset
    """
    default_union = base_ds.default_union
    base_named_graphs = list(base_ds.contexts())
    if target_ds is None:
        target_ds = rdflib.Dataset(default_union=default_union)
    elif isinstance(target_ds, rdflib.ConjunctiveGraph):
        raise RuntimeError(
            "Cannot mix new graphs into a ConjunctiveGraph, use Dataset instead."
        )
    elif target_ds == "inplace":
        pass  # do nothing here
    elif not isinstance(target_ds, rdflib.Dataset):
        raise RuntimeError(
            "Cannot mix datasets if target_ds passed in is not a Dataset itself."
        )
    if isinstance(extra_ds, (rdflib.Dataset, rdflib.ConjunctiveGraph)):
        mixin_graphs = list(extra_ds.contexts())
    else:
        mixin_graphs = [extra_ds]
    if target_ds == "inplace":
        target_ds = base_ds
        for mg in mixin_graphs:
            mod_named_graphs = {
                g.identifier: mix_graphs(g, mg, target_graph="inplace")
                for g in base_named_graphs
            }
    elif isinstance(target_ds, str):
        raise RuntimeError(
            "target_ds cannot be a string (unless it is 'inplace')")
    else:

        mixed_graphs = {}
        for mg in mixin_graphs:
            mod_named_graphs = {
                g.identifier:
                mix_graphs(g,
                           mg,
                           target_graph=rdflib.Graph(store=target_ds.store,
                                                     identifier=g.identifier))
                for g in base_named_graphs
            }
            mixed_graphs.update(mod_named_graphs)
        default_context_id = target_ds.default_context.identifier
        for i, m in mixed_graphs.items():
            if i == default_context_id:
                target_ds.store.remove_graph(target_ds.default_context)
                target_ds.default_context = m
            target_ds.add_graph(m)
    return target_ds
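A hedged usage sketch for mix_datasets; the two input datasets are placeholders, and only the call pattern follows from the signature above:

# Clone base_ds and mix extra_ds into the clone, or modify base_ds in place.
mixed = mix_datasets(base_ds, extra_ds)                # returns a freshly created Dataset
mix_datasets(base_ds, extra_ds, target_ds="inplace")   # mixes directly into base_ds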
Code example #6
File: load.py  Project: RDFLib/pySHACL
def load_from_source(
    source: Union[GraphLike, BufferedIOBase, TextIOBase, BinaryIO,
                  Union[str, bytes]],
    g: Optional[GraphLike] = None,
    rdf_format: Optional[str] = None,
    multigraph: bool = False,
    do_owl_imports: Union[bool, int] = False,
    import_chain: Optional[List[Union[rdflib.URIRef, str]]] = None,
):
    """

    :param source:
    :param g:
    :type g: rdflib.Graph | None
    :param rdf_format:
    :type rdf_format: str
    :param multigraph:
    :type multigraph: bool
    :param do_owl_imports:
    :type do_owl_imports: bool|int
    :param import_chain:
    :type import_chain: list | None
    :return:
    """
    source_is_graph = False
    open_source: Optional[Union[BufferedIOBase, BinaryIO]] = None
    source_was_open: bool = False
    source_as_file: Optional[Union[BufferedIOBase, BinaryIO]] = None
    source_as_filename: Optional[str] = None
    source_as_bytes: Optional[bytes] = None
    filename = None
    public_id = None
    uri_prefix = None
    is_imported_graph = do_owl_imports and isinstance(
        do_owl_imports, int) and do_owl_imports > 1
    if isinstance(source,
                  (rdflib.Graph, rdflib.ConjunctiveGraph, rdflib.Dataset)):
        source_is_graph = True
        if g is None:
            g = source
        else:
            raise RuntimeError(
                "Cannot pass in both target=rdflib.Graph/Dataset and g=graph.")
    elif isinstance(source, (BufferedIOBase, TextIOBase)):
        if hasattr(source, 'name'):
            filename = source.name  # type: ignore
            public_id = Path(filename).resolve().as_uri() + "#"
        if isinstance(source, TextIOBase):
            buf = getattr(source, "buffer")  # type: BufferedIOBase
            source_as_file = source = buf
        else:
            source_as_file = source
        if hasattr(source, 'closed'):
            if not bool(source.closed):
                open_source = source
                source_was_open = True
        else:
            # Assume it is open now and it was open when we started.
            open_source = source
            source_was_open = True

    elif isinstance(source, str):
        pid = os.getpid()
        fd0 = "/proc/{}/fd/0".format(str(pid))
        if is_windows and source.startswith('file:///'):
            public_id = source
            filename = source[8:]
            source_as_filename = filename
        elif not is_windows and source.startswith('file://'):
            public_id = source
            filename = source[7:]
            source_as_filename = filename
        elif source.startswith('http:') or source.startswith('https:'):
            public_id = source
            try:
                resp, rdf_format = get_rdf_from_web(source)
            except HTTPError:
                if is_imported_graph:
                    return g
                else:
                    raise
            if rdf_format == 'graph':
                source = resp
                source_is_graph = True
            else:
                filename = resp.geturl()
                fp = resp.fp  # type: BufferedIOBase
                source_was_open = False
                source = open_source = fp
        else:
            first_char = source[0]
            if is_windows and (first_char == '\\' or
                               (len(source) > 3 and source[1:3] == ":\\")):
                filename = source
                source_as_filename = filename
            elif first_char == '/' or (len(source) > 2
                                       and source[0:2] == "./"):
                filename = source
                source_as_filename = filename
            elif (first_char == '#' or first_char == '@' or first_char == '<'
                  or first_char == '\n' or first_char == '{'
                  or first_char == '['):
                # Contains some JSON or XML or Turtle chars, it's not a path
                source_as_file = None
                source_as_filename = None
            elif len(source) >= 32 and '\n' in source[:32]:
                # Contains a new line near the start of the file, can't be a path
                source_as_file = None
                source_as_filename = None
            elif len(source) < 140:
                filename = source
                source_as_filename = filename
        if source_as_filename and filename:
            if filename == "stdin" or filename == "/dev/stdin" or filename == "-" or filename == fd0:
                source = source_as_file = open_source = sys.stdin.buffer
                source_was_open = True
            else:
                try:
                    filename = os.readlink(filename)
                    if filename == fd0 or filename == "/dev/stdin":
                        source = source_as_file = open_source = sys.stdin.buffer
                        source_was_open = True
                except OSError:
                    pass
        # TODO: Do we still need this? Not sure why this was added, but works better without it
        #  if public_id and not public_id.endswith('#'):
        #     public_id = "{}#".format(public_id)
        if not source_as_file and not source_as_filename and not open_source and isinstance(
                source, str):
            # source is raw RDF data.
            source_as_bytes = source = source.encode('utf-8')
    elif isinstance(source, bytes):
        if source.startswith(b'file:') or source.startswith(
                b'http:') or source.startswith(b'https:'):
            raise ValueError(
                "file:// and http:// strings should be given as str, not bytes."
            )
        first_char_b: bytes = source[0:1]
        if (first_char_b == b'#' or first_char_b == b'@'
                or first_char_b == b'<' or first_char_b == b'\n'
                or first_char_b == b'{' or first_char_b == b'['):
            # Contains some JSON or XML or Turtle stuff
            source_as_file = None
            source_as_filename = None
        elif len(source) < 140:
            filename = source.decode('utf-8')
            source_as_filename = filename
        if not source_as_file and not source_as_filename and not open_source:
            source_as_bytes = source
    else:
        raise ValueError("Cannot determine the format of the input graph")
    if g is None:
        if source_is_graph:
            target_g: Union[rdflib.Graph, rdflib.ConjunctiveGraph,
                            rdflib.Dataset] = source  # type: ignore
        else:
            target_g = rdflib.Dataset() if multigraph else rdflib.Graph()
    else:
        if not isinstance(
                g, (rdflib.Graph, rdflib.Dataset, rdflib.ConjunctiveGraph)):
            raise RuntimeError(
                "Passing in 'g' must be a rdflib Graph or Dataset.")
        target_g = g

    if filename:
        if filename.endswith('.ttl'):
            rdf_format = rdf_format or 'turtle'
        elif filename.endswith('.nt'):
            rdf_format = rdf_format or 'nt'
        elif filename.endswith('.n3'):
            rdf_format = rdf_format or 'n3'
        elif filename.endswith('.json'):
            rdf_format = rdf_format or 'json-ld'
        elif filename.endswith('.nq') or filename.endswith('.nquads'):
            rdf_format = rdf_format or 'nquads'
        elif filename.endswith('.trig'):
            rdf_format = rdf_format or 'trig'
        elif filename.endswith('.xml') or filename.endswith('.rdf'):
            rdf_format = rdf_format or 'xml'
    if source_as_filename and filename is not None and not open_source:
        filename = str(Path(filename).resolve())
        if not public_id:
            public_id = Path(filename).as_uri() + "#"
        source = open_source = open(filename, mode='rb')
    if not open_source and source_as_bytes:
        source = open_source = BytesIO(source_as_bytes)  # type: ignore
    if open_source:
        _source = open_source
        # Check if we can seek
        try:
            _source.seek(0)  # type: ignore
        except (AttributeError, UnsupportedOperation):
            # Read it all into memory
            new_bytes = BytesIO(_source.read())
            if not source_was_open:
                _source.close()
            source = _source = new_bytes
            source_was_open = False
        if rdf_format is None:
            line = _source.readline().lstrip()
            if len(line) > 15:
                line = line[:15]
            line = line.lower()
            if line.startswith(b"<!doctype html") or line.startswith(b"<html"):
                raise RuntimeError("Attempted to load a HTML document as RDF.")
            if line.startswith(b"<?xml") or line.startswith(
                    b"<xml") or line.startswith(b"<rdf:"):
                rdf_format = "xml"
            if line.startswith(b"@base ") or line.startswith(
                    b"@prefix ") or line.startswith(b"PREFIX "):
                rdf_format = "turtle"
            try:
                _source.seek(0)
            except (AttributeError, UnsupportedOperation):
                raise RuntimeError("Seek failed while identifying file type.")
            except ValueError:
                raise RuntimeError("File closed while identifying file type.")
        if rdf_format == 'turtle' or rdf_format == 'n3':
            # SHACL Shapes files and Data files can have extra RDF Metadata in the
            # Top header block, including #BaseURI and #Prefix.
            # The @base line is not read here, but it is parsed in the n3 parser
            while True:
                try:
                    line = _source.readline()
                    assert line is not None and len(line) > 0
                except AssertionError:
                    break
                # Strip line from start
                while len(
                        line) > 0 and line[0:1] in b' \t\n\r\x0B\x0C\x85\xA0':
                    line = line[1:]
                # We reached the end of the line, check the next line
                if len(line) < 1:
                    continue
                # If this is not a comment, then this is the first non-comment line, we're done.
                if not line[0:1] == b'#':
                    break
                # Strip from start again, but now removing hashes too.
                while len(line) > 0 and line[0:1] in b'# \t\xA0':
                    line = line[1:]
                # Strip line from end
                while len(
                        line) > 0 and line[-1:] in b' \t\n\r\x0B\x0C\x85\xA0':
                    line = line[:-1]
                spl = line.split(b':', 1)
                if len(spl) < 2:
                    continue
                keyword = spl[0].lower()
                # Strip keyword end
                while len(keyword
                          ) > 0 and keyword[-1:] in b' \t\n\r\x0B\x0C\x85\xA0':
                    keyword = keyword[:-1]
                if len(keyword) < 1:
                    continue
                wordval = spl[1]
                # Strip wordval start
                while len(wordval
                          ) > 0 and wordval[0:1] in b' \t\n\r\x0B\x0C\x85\xA0':
                    wordval = wordval[1:]
                if len(wordval) < 1:
                    continue
                wordval_str = wordval.decode('utf-8')
                if keyword == b"baseuri":
                    public_id = wordval_str
                elif keyword == b"prefix":
                    uri_prefix = wordval_str
            try:
                _source.seek(0)
            except (AttributeError, UnsupportedOperation):
                raise RuntimeError(
                    "Seek failed while pre-parsing Turtle File.")
            except ValueError:
                raise RuntimeError(
                    "File closed while pre-parsing Turtle File.")
        target_g.parse(source=_source, format=rdf_format, publicID=public_id)
        # If the target was open to begin with, leave it open.
        if not source_was_open:
            _source.close()
        elif hasattr(_source, 'seek'):
            try:
                _source.seek(0)
            except (AttributeError, UnsupportedOperation):
                pass
            except ValueError:
                # The parser closed our file!
                pass
        source_is_graph = True
    elif source_is_graph and (target_g != source):
        # clone source into g
        if isinstance(
                target_g,
            (rdflib.Dataset, rdflib.ConjunctiveGraph)) and isinstance(
                source, (rdflib.Dataset, rdflib.ConjunctiveGraph)):
            clone_dataset(source, target_g)
        elif isinstance(target_g, rdflib.Graph) and isinstance(
                source, (rdflib.Dataset, rdflib.ConjunctiveGraph)):
            raise RuntimeError(
                "Cannot load a Dataset source into a Graph target.")
        elif isinstance(
                target_g,
            (rdflib.Dataset, rdflib.ConjunctiveGraph)) and isinstance(
                source, rdflib.Graph):
            target = rdflib.Graph(store=target_g.store, identifier=public_id)
            clone_graph(source, target)
        elif isinstance(target_g, rdflib.Graph) and isinstance(
                source, rdflib.Graph):
            clone_graph(source, target_g)
        else:
            raise RuntimeError("Cannot merge source graph into target graph.")

    if not source_is_graph:
        raise RuntimeError("Error opening graph from source.")

    if public_id:
        if uri_prefix:
            if is_imported_graph and uri_prefix == '':
                # Don't reassign blank prefix, when importing subgraph
                pass
            else:
                has_named_prefix = target_g.store.namespace(uri_prefix)
                if not has_named_prefix:
                    target_g.namespace_manager.bind(uri_prefix, public_id)
        elif not is_imported_graph:
            existing_blank_prefix = target_g.store.namespace('')
            if not existing_blank_prefix:
                target_g.namespace_manager.bind('', public_id)
    if do_owl_imports:
        if isinstance(do_owl_imports, int):
            if do_owl_imports > 3:
                return target_g
        else:
            do_owl_imports = 1

        if import_chain is None:
            import_chain = []
        if public_id and (public_id.endswith('#') or public_id.endswith('/')):
            root_id: Union[rdflib.URIRef, None] = rdflib.URIRef(public_id[:-1])
        else:
            root_id = rdflib.URIRef(public_id) if public_id else None
        done_imports = 0
        if root_id is not None:
            if isinstance(target_g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(target_g.contexts())
            else:
                gs = [target_g]
            for ng in gs:
                owl_imports = list(ng.objects(root_id, rdflib.OWL.imports))
                if len(owl_imports) > 0:
                    import_chain.append(root_id)
                for o in owl_imports:
                    if o in import_chain:
                        continue
                    load_from_source(
                        o,
                        g=target_g,
                        multigraph=multigraph,
                        do_owl_imports=do_owl_imports + 1,
                        import_chain=import_chain,
                    )
                    done_imports += 1
        if done_imports < 1 and public_id is not None and root_id != public_id:
            public_id_uri = rdflib.URIRef(public_id)
            if isinstance(target_g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(target_g.contexts())
            else:
                gs = [target_g]
            for ng in gs:
                owl_imports = list(
                    ng.objects(public_id_uri, rdflib.OWL.imports))
                if len(owl_imports) > 0:
                    import_chain.append(public_id_uri)
                for o in owl_imports:
                    if o in import_chain:
                        continue
                    load_from_source(
                        o,
                        g=target_g,
                        multigraph=multigraph,
                        do_owl_imports=do_owl_imports + 1,
                        import_chain=import_chain,
                    )
                    done_imports += 1
        if done_imports < 1:
            if isinstance(target_g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(target_g.contexts())
            else:
                gs = [target_g]
            for ng in gs:
                ontologies = ng.subjects(rdflib.RDF.type, rdflib.OWL.Ontology)
                for ont in ontologies:
                    if ont == root_id or ont == public_id:
                        continue
                    if ont in import_chain:
                        continue
                    owl_imports = list(ng.objects(ont, rdflib.OWL.imports))
                    if len(owl_imports) > 0:
                        import_chain.append(ont)
                    for o in owl_imports:
                        if o in import_chain:
                            continue
                        load_from_source(
                            o,
                            g=target_g,
                            multigraph=multigraph,
                            do_owl_imports=do_owl_imports + 1,
                            import_chain=import_chain,
                        )
                        done_imports += 1
    return target_g
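A minimal usage sketch for load_from_source; the file name is an assumption. With multigraph=True the target becomes an rdflib.Dataset, so a TriG file keeps its named graphs, and do_owl_imports=True follows owl:imports recursively:

# Hypothetical call.
ds = load_from_source("shapes.trig", multigraph=True, do_owl_imports=True)
print(len(list(ds.contexts())))  # number of named graphs that were loaded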
Code example #7
File: onstage.py  Project: LvanWissen/create-datasets
def main(fp='data/onstage.nt'):

    # If there was no format issue in the streets data, this function would
    # work. Instead, download the data yourself and point to it:
    # datasets = downloadDatasets(datasets=(GEBOUWEN, PERSONS, WIJKEN))

    dsG = rdflib.Dataset()  # rdflib Dataset
    rdfSubject.db = dsG  # hook onto rdfAlchemy

    TITLE = ["ONSTAGE"]
    DESCRIPTION = [
        Literal(
            """Online Datasystem of Theatre in Amsterdam from the Golden Age to the present. This is your address for questions about the repertoire, performances, popularity and revenues of the cultural program in Amsterdam’s public theatre during the period 1637 - 1772. All data provided in this system links to archival source materials in contemporary administration.

The [Shows page](http://www.vondel.humanities.uva.nl/onstage/shows/) gives you access by date to chronological lists of the theater program, and the plays staged per day. At the [Plays page](http://www.vondel.humanities.uva.nl/onstage/plays/) you have access to the repertoire by title, and for each play you will find its performances and revenues throughout time. At the [Persons page](http://www.vondel.humanities.uva.nl/onstage/persons/) you can access the data for playwrights, actors and actresses, and translators involved in the rich national and international variety of the Amsterdam Theater productions.

Go see your favorite play!""",
            lang='en')
    ]

    DATE = Literal(datetime.datetime.now().strftime('%Y-%m-%d'),
                   datatype=XSD.datetime)

    ds = Dataset(
        create.term('id/onstage/'),
        label=TITLE,
        name=TITLE,
        dctitle=TITLE,
        description=DESCRIPTION,
        dcdescription=DESCRIPTION,
        image=URIRef(
            "http://www.vondel.humanities.uva.nl/onstage/images/logo.png"),
        url=[URIRef("http://www.vondel.humanities.uva.nl/onstage/")],
        temporalCoverage=[Literal("1637-01-01/1772-12-31")],
        spatialCoverage=[Literal("Amsterdam")],
        dateModified=DATE,
        dcdate=DATE,
        dcmodified=DATE,
        licenseprop=URIRef(
            "https://creativecommons.org/publicdomain/zero/1.0/"))

    # Add the datasets as separate graphs. Metadata on these graphs is in the
    # default graph.
    guri = create.term('id/onstage/')

    # download = DataDownload(None,
    #                         contentUrl=URIRef(uri),
    #                         encodingFormat="application/turtle")

    g = rdflib.Graph(identifier=guri)

    g.bind('schema', schema)
    g.bind('foaf', foaf)
    g.bind('dcterms', dcterms)
    g.bind('owl', OWL)
    g.bind('pnv', Namespace('https://w3id.org/pnv#'))
    g.bind(
        'onstage',
        Namespace('http://www.vondel.humanities.uva.nl/onstage/lod/vocab/#'))
    g.bind('bio', Namespace('http://purl.org/vocab/bio/0.1/'))
    g.bind('sem', Namespace('http://semanticweb.cs.vu.nl/2009/11/sem/#'))
    g.bind('skos', Namespace('http://www.w3.org/2004/02/skos/core#'))
    g.bind('time', Namespace('http://www.w3.org/2006/time#'))

    g.parse(fp, format='nt')

    dsG.add_graph(g)

    ds.triples = sum(1 for i in g.subjects())

    dsG.bind('void', void)
    dsG.bind('dcterms', dcterms)
    dsG.bind('schema', schema)

    print("Serializing!")
    dsG.serialize('datasets/onstage.trig', format='trig')
Code example #8
File: load.py  Project: mullikine/pySHACL
def load_from_source(source,
                     g=None,
                     rdf_format=None,
                     multigraph=False,
                     do_owl_imports=False,
                     import_chain=None):
    """

    :param source:
    :param g:
    :type g: rdflib.Graph
    :param rdf_format:
    :type rdf_format: str
    :param multigraph:
    :type multigraph: bool
    :param do_owl_imports:
    :type do_owl_imports: bool|int
    :param import_chain:
    :type import_chain: dict
    :return:
    """
    source_is_graph = False
    source_is_open = False
    source_was_open = False
    source_is_file = False
    source_is_bytes = False
    filename = None
    public_id = None
    uri_prefix = None
    is_imported_graph = do_owl_imports and isinstance(do_owl_imports, int) \
                        and do_owl_imports > 1
    if isinstance(source,
                  (rdflib.Graph, rdflib.ConjunctiveGraph, rdflib.Dataset)):
        source_is_graph = True
        if g is None:
            g = source
        else:
            raise RuntimeError(
                "Cannot pass in both target=rdflib.Graph/Dataset and g=graph.")
    elif isinstance(source, IOBase) and hasattr(source, 'read'):
        source_is_file = True
        if hasattr(source, 'closed'):
            source_is_open = not bool(source.closed)
            source_was_open = source_is_open
        else:
            # Assume it is open now and it was open when we started.
            source_is_open = True
            source_was_open = True
        filename = source.name
        public_id = Path(filename).resolve().as_uri() + "#"
    elif isinstance(source, str):
        if is_windows and source.startswith('file:///'):
            public_id = source
            source_is_file = True
            filename = source[8:]
        elif not is_windows and source.startswith('file://'):
            public_id = source
            source_is_file = True
            filename = source[7:]
        elif source.startswith('http:') or source.startswith('https:'):
            public_id = source
            try:
                source, rdf_format = get_rdf_from_web(source)
            except HTTPError:
                if is_imported_graph:
                    return g
                else:
                    raise
            source_is_open = True
            filename = source.geturl()
        else:
            first_char = source[0]
            if is_windows and (first_char == '\\' or
                               (len(source) > 3 and source[1:3] == ":\\")):
                source_is_file = True
                filename = source
            elif first_char == '/' or source[0:2] == "./":
                source_is_file = True
                filename = source
            elif first_char == '#' or first_char == '@' \
                or first_char == '<' or first_char == '\n' \
                    or first_char == '{' or first_char == '[':
                # Contains some JSON or XML or Turtle stuff
                source_is_file = False
            elif len(source) < 140:
                source_is_file = True
                filename = source
        if public_id and not public_id.endswith('#'):
            public_id = "{}#".format(public_id)
        if not source_is_file and not source_is_open:
            source = source.encode('utf-8')
            source_is_bytes = True
    elif isinstance(source, bytes):
        if (is_windows and source.startswith(b'file:///')) or \
           (not is_windows and source.startswith(b'file://')) or \
           source.startswith(b'http:') or source.startswith(b'https:'):
            raise ValueError(
                "file:// and http:// strings should be given as str, not bytes."
            )
        first_char = source[0:1]
        if first_char == b'#' or first_char == b'@' \
            or first_char == b'<' or first_char == b'\n' \
                or first_char == b'{' or first_char == b'[':
            # Contains some JSON or XML or Turtle stuff
            source_is_file = False
        elif len(source) < 140:
            source_is_file = True
            filename = source.decode('utf-8')
        if not source_is_file:
            source_is_bytes = True
    else:
        raise ValueError("Cannot determine the format of the input graph")
    if g is None:
        g = rdflib.Dataset() if multigraph else rdflib.Graph()
    else:
        if not isinstance(
                g, (rdflib.Graph, rdflib.Dataset, rdflib.ConjunctiveGraph)):
            raise RuntimeError("Passing in g must be a Graph.")
    if filename:
        if filename.endswith('.ttl'):
            rdf_format = rdf_format or 'turtle'
        elif filename.endswith('.nt'):
            rdf_format = rdf_format or 'nt'
        elif filename.endswith('.n3'):
            rdf_format = rdf_format or 'n3'
        elif filename.endswith('.json'):
            rdf_format = rdf_format or 'json-ld'
        elif filename.endswith('.nq') or filename.endswith('.nquads'):
            rdf_format = rdf_format or 'nquads'
        elif filename.endswith('.trig'):
            rdf_format = rdf_format or 'trig'
        elif filename.endswith('.xml') or filename.endswith('.rdf'):
            rdf_format = rdf_format or 'xml'
    if source_is_file and filename and not source_is_open:
        filename = Path(filename).resolve()
        if not public_id:
            public_id = Path(filename).as_uri() + "#"
        source = open(filename, mode='rb')
        source_is_open = True
    if source_is_open:
        data = source.read()
        # If the target was open to begin with, leave it open.
        if not source_was_open:
            source.close()
        elif hasattr(source, 'seek'):
            try:
                source.seek(0)
            except Exception:
                pass
        source = data
        source_is_bytes = True

    if source_is_bytes:
        source = BytesIO(source)
        if (rdf_format == "json-ld"
                or rdf_format == "json") and not has_json_ld:
            raise RuntimeError(
                "Cannot load a JSON-LD file if rdflib_jsonld is not installed."
            )
        if rdf_format == 'turtle' or rdf_format == 'n3':
            # SHACL Shapes files and Data files can have extra RDF Metadata in the
            # Top header block, including #BaseURI and #Prefix.
            while True:
                try:
                    l = source.readline()
                    assert l is not None and len(l) > 0
                except AssertionError:
                    break
                # Strip line from start
                while len(l) > 0 and l[0:1] in b' \t\n\r\x0B\x0C\x85\xA0':
                    l = l[1:]
                # We reached the end of the line, check the next line
                if len(l) < 1:
                    continue
                # If this is not a comment, then this is the first non-comment line, we're done.
                if not l[0:1] == b'#':
                    break
                # Strip from start again, but now removing hashes too.
                while len(l) > 0 and l[0:1] in b'# \t\xA0':
                    l = l[1:]
                # Strip line from end
                while len(l) > 0 and l[-1:] in b' \t\n\r\x0B\x0C\x85\xA0':
                    l = l[:-1]
                spl = l.split(b':', 1)
                if len(spl) < 2:
                    continue
                keyword = spl[0].lower()
                # Strip keyword end
                while len(keyword
                          ) > 0 and keyword[-1:] in b' \t\n\r\x0B\x0C\x85\xA0':
                    keyword = keyword[:-1]
                if len(keyword) < 1:
                    continue
                wordval = spl[1]
                # Strip wordval start
                while len(wordval
                          ) > 0 and wordval[0:1] in b' \t\n\r\x0B\x0C\x85\xA0':
                    wordval = wordval[1:]
                if len(wordval) < 1:
                    continue
                wordval = wordval.decode('utf-8')
                if keyword == b"baseuri":
                    public_id = wordval
                elif keyword == b"prefix":
                    uri_prefix = wordval
            source.seek(0)
        g.parse(source=source, format=rdf_format, publicID=public_id)
        source_is_graph = True

    if not source_is_graph:
        raise RuntimeError("Error opening graph from source.")

    if public_id:
        if uri_prefix:
            if is_imported_graph and uri_prefix == '':
                # Don't reassign blank prefix, when importing subgraph
                pass
            else:
                has_named_prefix = g.store.namespace(uri_prefix)
                if not has_named_prefix:
                    g.namespace_manager.bind(uri_prefix, public_id)
        elif not is_imported_graph:
            existing_blank_prefix = g.store.namespace('')
            if not existing_blank_prefix:
                g.namespace_manager.bind('', public_id)
    if do_owl_imports:
        if isinstance(do_owl_imports, int):
            if do_owl_imports > 3:
                return g
        else:
            do_owl_imports = 1

        if import_chain is None:
            import_chain = []
        if public_id and (public_id.endswith('#') or public_id.endswith('/')):
            root_id = rdflib.URIRef(public_id[:-1])
        else:
            root_id = rdflib.URIRef(public_id) if public_id else None
        done_imports = 0
        if root_id is not None:
            if isinstance(g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(g.contexts())
            else:
                gs = [g]
            for ng in gs:
                owl_imports = list(ng.objects(root_id, rdflib.OWL.imports))
                if len(owl_imports) > 0:
                    import_chain.append(root_id)
                for o in owl_imports:
                    if o in import_chain:
                        continue
                    load_from_source(o,
                                     g=g,
                                     multigraph=multigraph,
                                     do_owl_imports=do_owl_imports + 1,
                                     import_chain=import_chain)
                    done_imports += 1
        if done_imports < 1 and public_id is not None and root_id != public_id:
            public_id_uri = rdflib.URIRef(public_id)
            if isinstance(g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(g.contexts())
            else:
                gs = [g]
            for ng in gs:
                owl_imports = list(
                    ng.objects(public_id_uri, rdflib.OWL.imports))
                if len(owl_imports) > 0:
                    import_chain.append(public_id_uri)
                for o in owl_imports:
                    if o in import_chain:
                        continue
                    load_from_source(o,
                                     g=g,
                                     multigraph=multigraph,
                                     do_owl_imports=do_owl_imports + 1,
                                     import_chain=import_chain)
                    done_imports += 1
        if done_imports < 1:
            if isinstance(g, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
                gs = list(g.contexts())
            else:
                gs = [g]
            for ng in gs:
                ontologies = ng.subjects(rdflib.RDF.type, rdflib.OWL.Ontology)
                for ont in ontologies:
                    if ont == root_id or ont == public_id:
                        continue
                    if ont in import_chain:
                        continue
                    owl_imports = list(ng.objects(ont, rdflib.OWL.imports))
                    if len(owl_imports) > 0:
                        import_chain.append(ont)
                    for o in owl_imports:
                        if o in import_chain:
                            continue
                        load_from_source(o,
                                         g=g,
                                         multigraph=multigraph,
                                         do_owl_imports=do_owl_imports + 1,
                                         import_chain=import_chain)
                        done_imports += 1
    return g
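A usage sketch for this older variant; the inline data is an example. A str that starts with '@' is treated as raw RDF text rather than a path, encoded to bytes, and parsed directly:

# Hypothetical call: load inline Turtle into a plain Graph.
data = "@prefix ex: <http://example.org/> . ex:s ex:p ex:o ."
g = load_from_source(data, rdf_format='turtle')
print(len(g))  # 1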
Code example #9
def main():

    # If there was no format issue in the streets data, this function would
    # work. Instead, download the data yourself and point to it:
    # datasets = downloadDatasets(datasets=(GEBOUWEN, PERSONS, WIJKEN))
    datasets = [
        ('https://adamlink.nl/data/rdf/streets', 'data/adamlinkstraten.ttl'),
        ('https://adamlink.nl/data/rdf/buildings',
         'data/adamlinkgebouwen.ttl'),
        ('https://adamlink.nl/data/rdf/districts', 'data/adamlinkbuurten.ttl'),
        ('https://adamlink.nl/data/rdf/persons', 'data/adamlinkpersonen.ttl')
    ]

    dsG = rdflib.Dataset()  # rdflib Dataset
    rdfSubject.db = dsG  # hook onto rdfAlchemy

    TITLE = ["Adamlink"]
    DESCRIPTION = [
        Literal(
            """Adamlink, een project van [Stichting AdamNet](http://www.adamnet.nl), wil Amsterdamse collecties verbinden en als LOD beschikbaar maken.

Om collecties te verbinden hebben we identifiers ([URIs](https://nl.wikipedia.org/wiki/Uniform_resource_identifier)) voor concepten als straten, personen en gebouwen nodig. Vaak zijn die al beschikbaar, bijvoorbeeld in de [BAG](https://nl.wikipedia.org/wiki/Basisregistraties_Adressen_en_Gebouwen), [RKDartists](https://rkd.nl/nl/explore/artists) of [Wikidata](https://www.wikidata.org).

Hier voegen we onze eigen Adamlink URIs aan die identifiers toe. Niet omdat we die beter vinden dan BAG, RKDartists of Wikidata, maar omdat bepaalde concepten - verdwenen straten bijvoorbeeld - niet in genoemde authority sets terug te vinden zijn. En omdat we op Adamlink allerlei naamvarianten van concepten bijeen kunnen brengen.

We proberen Adamlink als hub laten fungeren, door bijvoorbeeld bij een straat naar zowel BAG als Wikidata te verwijzen. Regelmatig nemen we data eerst op Adamlink op, bijvoorbeeld alle geportretteerden die we in de beeldbank van het Stadsarchief tegenkomen, om die personen vervolgens (zowel scriptsgewijs als handmatig) te verbinden met bestaande authority sets als Wikidata, Ecartico of RKDartists.

Maakt en publiceert u data met (historische) straat-, gebouw- of persoonsnamen? Gebruik dan altijd een identifier die door zoveel mogelijk anderen ook gebruikt wordt. U heeft dan toegang tot alle andere informatie die over zo'n concept beschikbaar is, zoals naamsvarianten of de locatie of de tijd waarin het concept leefde of bestond. En u verbindt uw data ook met de collecties van Amsterdamse erfgoedinstellingen.""",
            lang='nl'),
        Literal("Reference data for Amsterdam collections.", lang='en')
    ]
    DATE = Literal(datetime.datetime.now().strftime('%Y-%m-%d'),
                   datatype=XSD.datetime)

    ds = Dataset(create.term('id/adamlink/'),
                 label=TITLE,
                 name=TITLE,
                 dctitle=TITLE,
                 description=DESCRIPTION,
                 dcdescription=DESCRIPTION,
                 image=URIRef("https://adamlink.nl/img/footerimg.jpg"),
                 url=[URIRef("https://www.adamlink.nl/")],
                 temporalCoverage=[Literal("1275-10-27/..")],
                 spatialCoverage=[Literal("Amsterdam")],
                 dateModified=DATE,
                 dcdate=DATE,
                 dcmodified=DATE)

    subdatasets = []

    # Add the datasets as separate graphs. Metadata on these graphs is in the
    # default graph.
    for uri, fp in datasets:

        graphtype = uri.replace(PREFIX, '')
        guri = create.term('id/adamlink/' + graphtype + '/')

        TITLE = [f"Adamlink {graphtype.title()}"]
        DESCRIPTION = [
            Literal(
                f"Data over {graphtype} uit Adamlink - Referentiedata voor Amsterdamse collecties.",
                lang='nl'),
            Literal(
                f"Data on {graphtype} from Adamlink - Reference data for Amsterdam collections.",
                lang='en')
        ]

        download = DataDownload(None,
                                contentUrl=URIRef(uri),
                                encodingFormat="application/turtle")

        subds = Dataset(guri,
                        label=TITLE,
                        name=TITLE,
                        dctitle=TITLE,
                        description=DESCRIPTION,
                        dcdescription=DESCRIPTION,
                        url=[URIRef("https://www.adamlink.nl/")],
                        temporalCoverage=[Literal("1275-10-27/..")],
                        spatialCoverage=[Literal("Amsterdam")],
                        distribution=[download])

        # Add data to the respective graph
        print("Parsing", uri)
        subgraph = rdflib.Graph(identifier=guri)
        subgraph.parse(fp, format='turtle')

        dsG.add_graph(subgraph)
        subdatasets.append(subds)

    print("Adding more meta data and dataset relations")
    for subds in subdatasets:
        subds.isPartOf = ds
        subds.inDataset = ds

        subds.triples = sum(1 for i in subgraph.subjects())

    ds.hasPart = subdatasets
    ds.subset = subdatasets

    ds.triples = sum(
        1
        for i in dsG.graph(identifier=create.term('id/adamlink/')).subjects())

    dsG.bind('void', void)
    dsG.bind('dcterms', dcterms)
    dsG.bind('schema', schema)

    print("Serializing!")
    dsG.serialize('datasets/adamlink.trig', format='trig')
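Reading the exported TriG back into an rdflib.Dataset is symmetrical; a minimal sketch reusing the path from the script above:

# Minimal sketch: reload the serialized file and list its named graphs.
check = rdflib.Dataset()
check.parse('datasets/adamlink.trig', format='trig')
for g in check.contexts():
    print(g.identifier, len(g))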