Example #1
    def process_dataset_description(self):
        log.info("# Processing dataset description.")
        input_data = self.dataset_description

        log.debug(f"Input data: {input_data}")
        if not dataset_description_validator(input_data):
            log.error(
                "The dataset description document did not validate. These errors were "
                "reported:"
            )
            log.error(pformat(dataset_description_validator.errors))
            raise SystemExit(1)
        log.debug("Input data validated.")

        self.data_provider = URIRef(input_data["data_provider"])
        self.file_namespace = file_namespace = input_data["file_namespace"]
        self.creation_uuid_ns = uuid.uuid5(uuid.NAMESPACE_URL, self.file_namespace)

        # initialize graph; by construction this UUID equals creation_uuid_ns above
        self.graph_uuid = uuid.uuid5(uuid.NAMESPACE_URL, file_namespace)
        graph_iri = f"{ENTITIES_NAMESPACE}{self.graph_uuid}"
        graph = self.graph = Graph(identifier=graph_iri)

        # describe graph
        s = URIRef(graph_iri)
        for p, o in [
            (RDF.type, m4p0.RDFGraph),
            (RDFS.label, Literal(f"{file_namespace} @ {self.import_time_string}")),
            (edm.dataProvider, self.data_provider),
            (dc.date, Literal(self.import_time)),
        ]:
            graph.add((s, p, o))
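
For orientation, this is roughly the smallest `dataset_description` mapping the method above can work with; `data_provider` and `file_namespace` are the keys it actually reads, though `dataset_description_validator` may enforce more. Both values are made up:

    dataset_description = {
        "data_provider": "https://example.org/museum",   # hypothetical; wrapped in a URIRef
        "file_namespace": "https://example.org/files/",  # hypothetical; seeds the UUIDv5 values
    }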
Example #2
    def __init__(self, path: Path, config: SimpleNamespace):
        log.info(f"Setting up import from {path}")

        self.config = config
        self.import_time = datetime.now()
        self.import_time_string = self.import_time.isoformat(timespec="seconds")

        log_folder = path / "logs"
        try:
            log_folder.mkdir(exist_ok=True)
        except FileNotFoundError:
            log.error(f"The import folder '{path}' doesn't exist. Aborting.")
            raise SystemExit(1)
        set_file_log_handler(log_folder / f"{self.import_time_string}.log")

        self.dataset_description = yaml.load(
            (path / "dataset.yml").read_text(), Loader=yaml.SafeLoader
        )

        self.source_files: Dict[str, Path] = {}
        for name in ("images", "audio_video", "3d", "entities"):
            data_file_path = path / f"{name}.csv"
            if data_file_path.exists():
                self.source_files[name] = data_file_path
        if not any(x in self.source_files for x in ("images", "audio_video", "3d")):
            log.error(
                "At least one of 'images.csv', 'audio_video.csv' or '3d.csv' "
                "must be present in an import folder."
            )
            raise SystemExit(1)

        self.graph: Optional[Graph] = None
        self.creation_iris: Set[URIRef] = set()
        self.creation_uuid_ns: Optional[uuid.UUID] = None
        self.encountered_filenames: Set[str] = set()
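
A sketch of how this initializer would be driven, assuming a class name of `Importer` (the examples never show it) and made-up config values; the folder must contain `dataset.yml` and at least one of `images.csv`, `audio_video.csv` or `3d.csv`, and a `logs/` subfolder is created on the fly:

    from pathlib import Path
    from types import SimpleNamespace

    config = SimpleNamespace(
        sparql_endpoint="https://example.org/sparql",  # hypothetical endpoint
        sparql_user="importer",                        # hypothetical credentials
        sparql_pass="secret",
        review=True,  # makes submit() page the query for manual review
    )
    importer = Importer(Path("2021-06-01_import"), config)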
Example #3
    def process_3d_data(self):
        source_file = self.source_files.get("3d")
        if source_file is None:
            log.debug("No 3D objects' metadata found.")
            return

        log.info("# Processing 3D objects' metadata.")
        self.process_metadata_file(source_file, self.add_3d_fields, _3d_validator)
        log.info("Done.")
Example #4
    def process_images_data(self):
        source_file = self.source_files.get("images")
        if source_file is None:
            log.debug("No images' metadata found.")
            return

        log.info("# Processing images' metadata.")
        self.process_metadata_file(source_file, self.add_core_fields, coreset_validator)
        log.info("Done.")
Example #5
    def process_entities_data(self):
        if self.source_files.get("entities") is None:
            log.debug("No entities' metadata found.")
            return
        log.info("# Processing entities' metadata.")

        graph = self.graph

        with self.source_files["entities"].open("rt", newline="") as f:
            csv_reader = csv.DictReader(f)
            for row in csv_reader:
                identifier = row.get("Identifier")

                if not entity_validator(row):
                    log.error(
                        "An entity description did not validate. These errors "
                        f"were reported for the identifier {identifier}:"
                    )
                    log.error(pformat(entity_validator.errors))
                    raise SystemExit(1)

                s = self.create_related_entity_iri(identifier)

                if (None, m4p0.refersToMuseumObject, s) not in graph:
                    log.error(
                        "This identifier is not referenced in the metadata of any "
                        f"digital object in the created graph: {identifier}"
                    )
                    raise SystemExit(1)

                label = Literal(row["Bezeichnung"])
                graph.add((s, RDF.type, m4p0.MuseumObject))
                graph.add((s, m4p0.museumObjectTitle, label))
                graph.add((s, RDFS.label, label))

                if "URL" in row:
                    graph.add((s, edm.isShownAt, URIRef(row["URL"])))

                arbitrary_fields = {
                    k: v for k, v in row.items() if k not in entity_description_schema
                }
                if arbitrary_fields:
                    blank_node = BNode()
                    graph.add((blank_node, RDF.type, m4p0.JSONObject))
                    graph.add(
                        (
                            blank_node,
                            m4p0.jsonData,
                            Literal(json.dumps(arbitrary_fields)),
                        )
                    )
                    graph.add((s, m4p0.isDescribedBy, blank_node))

        log.info("Done.")
Example #6
    def process_audio_video_data(self):
        source_file = self.source_files.get("audio_video")
        if source_file is None:
            log.debug("No audios' or videos' metadata found.")
            return

        log.info("# Processing audios' and videos' metadata")
        self.process_metadata_file(
            source_file, self.add_audio_video_fields, audio_video_validator
        )
        log.info("Done.")
Example #7
    def post_query(self, query: str):
        response = httpx.post(
            self.config.sparql_endpoint,
            auth=(self.config.sparql_user, self.config.sparql_pass),
            data=query.encode(),
            headers={
                "Content-Type": "application/sparql-update; charset=UTF-8",
                "Accept": "text/boolean",
            },
        )
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError:
            log.exception("The SPARQL endpoint responded with an error status.")
            raise SystemExit(1)
        else:
            log.info(f"Received response: {response.content.decode()}")
Example #8
    def submit(self):
        # submit the triples to the SPARQL-endpoint

        log.info("# Submitting graph data via SPARQL.")

        graph_iri = self.graph.identifier

        # rdflib < 6 returns bytes from serialize(), hence the decode()
        turtle_representation: List[str] = (
            self.graph.serialize(format="turtle").decode().splitlines()
        )

        deletion_query = f"""\
        DELETE {{?s ?p ?o}}
        WHERE {{ GRAPH <{graph_iri}> {{?s ?p ?o}} }}
        """

        # rdflib's Turtle output starts with the @prefix declarations,
        # followed by a blank line; rewrite them as SPARQL PREFIX clauses.
        prefixes = []
        for i, line in enumerate(turtle_representation):
            if line.startswith("@prefix "):
                # '@prefix ex: <...> .' -> 'PREFIX ex: <...>'
                prefixes.append("PREFIX " + line[8:-2])
            else:
                break

        prefixes_header = "\n".join(prefixes) + "\n"
        # index i is the blank line separating prefixes from the triples
        statements = "\n".join(turtle_representation[i + 1 :])  # noqa: E203

        insert_query = f"""\
        {prefixes_header}

        INSERT {{
          GRAPH <{graph_iri}> {{
            {statements}
          }}
        }} WHERE {{}}
        """

        if self.config.review:
            pager(insert_query)
            review_passed = input("Proceed? [yN]: ").lower()
            if not review_passed or review_passed[0] != "y":
                log.critical("User aborted after reviewing the SPARQL query.")
                raise SystemExit(1)

        log.debug("Generated SPARQL Query:")
        log.debug(insert_query)

        log.info(f"Deleting all existing triples from the graph <{graph_iri}>.")
        self.post_query(deletion_query)

        log.info(
            f"Posting generated triples to {self.config.sparql_endpoint} as "
            f"{self.config.sparql_user}."
        )
        self.post_query(insert_query)
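
A worked example of the Turtle-to-SPARQL prefix rewriting above, using the `edm` namespace these examples already rely on: the serializer emits

    @prefix edm: <http://www.europeana.eu/schemas/edm/> .

and `"PREFIX " + line[8:-2]` strips the leading `@prefix ` (8 characters) and the trailing ` .` (2 characters), yielding

    PREFIX edm: <http://www.europeana.eu/schemas/edm/>

which is the form the `INSERT` query's header requires.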