Exemple #1
0
    def find_metadata(self):
        """Collect ontology-level annotations from document-shaped cells.

        Every child of ``self.root`` whose style contains ``shape=document``
        is read as a ``|``-separated list of ``prefix:type:value`` entries.
        Each value is appended to the list stored in
        ``self.ontology_metadata`` under the key ``"prefix:type"``.

        For entries whose type is ``imports`` the value keeps every ``:`` it
        contains; for any other type only the text between the second and
        the third ``:`` is kept.

        Entries that do not have enough ``:``-separated parts are reported in
        ``self.errors["Metadata"]`` and skipped.

        Returns:
            ``self.ontology_metadata``.
        """
        for child in self.root:
            style = child.attrib.get("style", "")
            if "shape=document" not in style:
                continue

            value = child.attrib.get("value", "")
            # Identifier reported with errors; None when the cell has no
            # "id" attribute. (A bare ``id`` here would be the builtin.)
            shape_id = child.attrib.get("id")

            for ann in clean_html_tags(value).split("|"):
                parts = ann.split(":")
                try:
                    ann_prefix = parts[0].strip()
                    ann_type = parts[1].strip()
                    if ann_type == "imports":
                        # Re-join so the ":" characters of the value survive.
                        ann_value = ":".join(parts[2:]).strip()
                    else:
                        ann_value = parts[2].strip()
                except IndexError:
                    self.errors["Metadata"].append({
                        "message": "Problems in the text of the Metadata",
                        "shape_id": shape_id,
                        "value": value
                    })
                    continue

                key = ann_prefix + ":" + ann_type
                self.ontology_metadata.setdefault(key, []).append(ann_value)

        return self.ontology_metadata
Exemple #2
0
    def find_namespaces(self):
        """Collect prefix declarations from note-shaped cells.

        Every child of ``self.root`` whose style contains ``shape=note`` is
        read as a ``|``-separated list of ``prefix: IRI`` entries; blank
        entries are ignored. The prefix is the text before the first ``:``;
        the IRI is everything from the first ``http`` that follows that
        ``:`` up to the end of the entry. Each pair is stored in
        ``self.namespaces``.

        Entries in which no ``http`` follows the prefix are reported in
        ``self.errors["Namespaces"]`` and skipped.

        Returns:
            ``self.namespaces``.
        """
        for child in self.root:
            style = child.attrib.get("style", "")
            if "shape=note" not in style:
                continue

            value = child.attrib.get("value", "")
            # Identifier reported with errors; None when the cell has no
            # "id" attribute. (A bare ``id`` here would be the builtin.)
            shape_id = child.attrib.get("id")

            for ns in clean_html_tags(value).split("|"):
                ns = ns.strip()
                if not ns:
                    continue

                prefix, _, remainder = ns.partition(":")
                # Anchor on the FIRST "http" after the prefix so an IRI that
                # itself contains "http" again is not truncated.
                uri_start = remainder.find("http")
                if uri_start == -1:
                    self.errors["Namespaces"].append({
                        "message": "Problems in the text of the Namespace",
                        "shape_id": shape_id,
                        "value": value
                    })
                    continue

                self.namespaces[prefix.strip()] = remainder[uri_start:].strip()

        return self.namespaces
Exemple #3
0
    def find_attribute_values(self):
        """Collect quoted literal values from the diagram cells.

        A child of ``self.root`` is treated as a literal when its cleaned
        ``value`` contains either the ``&quot;`` entity or a plain ``"``.
        For each of them a dict is stored in ``self.attributes`` under the
        cell id with the keys:

        * ``xml_object`` - the cell itself.
        * ``value`` - the text between the first pair of quotes.
        * ``type`` - the text after the last ``^^``, or ``None``.
        * ``lang`` - the text after the last ``@`` (only looked for when
          there is no ``^^``), or ``None``.

        When the quoted text cannot be extracted (for instance a single
        unmatched ``"``), an entry is appended to
        ``self.errors["Individual"]`` and the cell is skipped.

        Returns:
            ``self.attributes``.
        """
        for child in self.root:

            shape_id = child.attrib["id"]
            value = child.attrib.get("value")

            if value is None:
                continue

            value = clean_html_tags(value)

            has_entity_quote = "&quot;" in value
            if not has_entity_quote and "\"" not in value:
                continue

            attribute = {"xml_object": child, "type": None, "lang": None}

            try:
                # Finding the value
                if has_entity_quote:
                    attribute["value"] = value.split("&quot;")[1]
                else:
                    # Non-greedy so only the first quoted run is taken.
                    attribute["value"] = re.findall('"(.*?)"', value)[0]
            except IndexError:
                self.errors["Individual"].append({
                    "message": "Problems in the text of the literal",
                    "shape_id": shape_id,
                    "value": value
                })
                continue

            # Finding the type; a datatype takes precedence over a language tag
            if "^^" in value:
                attribute["type"] = value.split("^^")[-1]
            elif "@" in value:
                attribute["lang"] = value.split("@")[-1]

            self.attributes[shape_id] = attribute

        return self.attributes
Exemple #4
0
    def find_individuals(self):
        """Collect the individuals (underlined cells) of the diagram.

        A child of ``self.root`` that has a ``value`` is treated as an
        individual when its style contains ``fontStyle=4`` or its raw value
        contains ``<u>``. The cleaned value is split on ``:``; the first
        part becomes ``prefix`` and the second becomes ``uri`` (with spaces
        removed). The result is stored in ``self.individuals`` under the
        cell id together with ``xml_object`` and ``type`` (always ``None``
        here).

        A value is rejected, and reported in ``self.errors["Individual"]``,
        when it has no ``:``, when the prefix is empty, or when the part
        after the ``:`` is shorter than two characters.

        Returns:
            ``self.individuals``.
        """
        for child in self.root:

            shape_id = child.attrib["id"]
            style = child.attrib.get("style", "")
            value = child.attrib.get("value")

            if value is None:
                continue

            if "fontStyle=4" not in style and "<u>" not in value:
                continue

            value = clean_html_tags(value)
            parts = value.split(":")

            # The two-character minimum on the local name is measured before
            # spaces are stripped, exactly as the previous index-based check.
            if len(parts) < 2 or not parts[0] or len(parts[1]) < 2:
                self.errors["Individual"].append({
                    "message": "Problems in the text of the Individual",
                    "shape_id": shape_id,
                    "value": value
                })
                continue

            self.individuals[shape_id] = {
                "xml_object": child,
                "prefix": parts[0],
                "uri": re.sub(" ", "", parts[1]),
                "type": None,
            }

        return self.individuals
Exemple #5
0
    def find_concepts_and_attributes(self):
        """Classify the plain boxes of the diagram as concepts or attribute blocks.

        Concept and attribute shapes carry no distinguishing style of their
        own, so a cell is considered only after every other kind has been
        ruled out: free text / edge labels, edges, ellipses, rhombuses,
        anything with ``shape`` in its style, underlined cells and literals.

        For each remaining cell the other candidate boxes are scanned. If
        the first corner of the cell lies within 5 units (on both axes) of
        the second corner of another box, the cell is an attribute block of
        that box: its cleaned text is split on ``|`` and each piece is
        parsed into an attribute dict (prefix, uri, label, datatype, range,
        domain, allValuesFrom, someValuesFrom, functional and the three
        cardinality keys). The block is stored in ``self.attribute_blocks``.
        The corner layout itself is defined by ``get_corners_rect_child``,
        which is not visible here.

        A cell with non-empty text that is attached to no other box is
        stored in ``self.concepts`` as ``prefix``/``uri``/``label``/
        ``xml_object``, unless its text looks like attribute syntax, in
        which case an error is recorded instead.

        Parse problems are appended to ``self.errors["Attributes"]`` or
        ``self.errors["Concepts"]``. Any other exception raised while
        handling a cell makes that cell be skipped silently.

        Returns:
            The tuple ``(self.concepts, self.attribute_blocks)``.
        """
        # Raw string: "\(" in a normal literal is an invalid escape sequence.
        cardinality_pattern = r"\(([0-9][^)]+)\)"
        # Style fragments that identify shapes which are never boxes.
        non_box_markers = ("text", "edgeLabel", "ellipse", "rhombus", "shape")

        for child in self.root:

            shape_id = child.attrib["id"]
            style = child.attrib.get("style", "")
            value = child.attrib.get("value", "")
            attributes_found = False

            try:
                # Check that neither of these components passes, this is because concepts
                # and attributes shape do not have a specific characteristic to differentiate them
                # and we have to use the characteristics of the rest of the shapes
                if "edge" in child.attrib:
                    continue
                if any(marker in style for marker in non_box_markers):
                    continue
                if "fontStyle=4" in style or "<u>" in value:
                    continue
                if "&quot;" in value or "^^" in value:
                    continue

                concept = {}
                attribute_block = {}
                attribute_block["xml_object"] = child

                p1, _, _, _ = get_corners_rect_child(child)

                # We need a second iteration because we need to know if there is a block
                # on top of the current block, that determines if we are dealing with a class or attributes
                for child2 in self.root:
                    style2 = child2.attrib.get("style", "")
                    # Filter all the elements except attributes and classes
                    if "edge" in child2.attrib:
                        continue
                    if any(marker in style2 for marker in non_box_markers):
                        continue

                    _, p2_support, _, _ = get_corners_rect_child(child2)
                    dx = abs(p1[0] - p2_support[0])
                    dy = abs(p1[1] - p2_support[1])

                    if not (dx < 5 and dy < 5):
                        continue

                    attributes = []
                    value = clean_html_tags(value)
                    attribute_list = value.split("|")
                    # A dashed block declares no domain for its attributes.
                    domain = False if "dashed=1" in style else child2.attrib["id"]

                    for attribute_value in attribute_list:
                        attribute = {}
                        attribute_value_cleaned = clean_uri(attribute_value)
                        try:
                            name_parts = attribute_value_cleaned.split(":")
                            attribute["prefix"] = name_parts[0].strip()
                            attribute["prefix"][0]  # Check if error in text
                            attribute["uri"] = name_parts[1].strip()

                            # Taking into account possible spaces in the uri of the concept
                            attribute["uri"] = re.sub(" ", "", attribute["uri"])

                            # NOTE(review): this rejects one-character prefixes;
                            # the concept branch checks uri[1] instead - confirm
                            # which of the two is intended.
                            attribute["prefix"][1]  # Check if error in text
                            attribute["label"] = create_label(
                                attribute["uri"], "property")
                        except Exception:
                            self.errors["Attributes"].append({
                                "message":
                                "Problems in the text of the attribute",
                                "shape_id": shape_id,
                                "value": attribute_value_cleaned
                            })
                            continue

                        try:
                            raw_parts = attribute_value.split(":")
                            if len(raw_parts) > 2:
                                final_datatype = raw_parts[2].strip()
                                # An empty datatype raises IndexError here.
                                final_datatype = (final_datatype[0].lower()
                                                  + final_datatype[1:])
                                attribute["datatype"] = final_datatype
                            else:
                                attribute["datatype"] = None
                        except IndexError:
                            self.errors["Attributes"].append({
                                "message":
                                "Problems in the datatype of the attribute",
                                "shape_id": shape_id,
                                "value": attribute_value_cleaned
                            })
                            continue

                        attribute["range"] = bool(attribute["datatype"])
                        attribute["domain"] = domain

                        # Existential Universal restriction evaluation
                        attribute["allValuesFrom"] = (
                            "(all)" in attribute_value
                            or "∀" in attribute_value)
                        attribute["someValuesFrom"] = (
                            "(some)" in attribute_value
                            or "∃" in attribute_value)
                        attribute["functional"] = "(F)" in attribute_value

                        # Cardinality restriction evaluation
                        try:
                            found = re.findall(cardinality_pattern,
                                               attribute_value)
                            if found:
                                # Expected form is "min..max"; a missing
                                # ".." raises IndexError below.
                                bounds = found[-1].split("..")
                                attribute["min_cardinality"] = bounds[0]
                                attribute["max_cardinality"] = bounds[1]
                            else:
                                attribute["min_cardinality"] = None
                                attribute["max_cardinality"] = None
                        except IndexError:
                            self.errors["Attributes"].append({
                                "message":
                                "Problems in cardinality definition",
                                "shape_id": shape_id,
                                "value": attribute_value_cleaned
                            })
                            continue

                        if attribute["min_cardinality"] == "0":
                            attribute["min_cardinality"] = None

                        if attribute["max_cardinality"] == "N":
                            attribute["max_cardinality"] = None

                        # Equal bounds (including both unset) collapse into
                        # a single "cardinality" entry.
                        if attribute["min_cardinality"] == attribute[
                                "max_cardinality"]:
                            attribute["cardinality"] = attribute[
                                "min_cardinality"]
                            attribute["min_cardinality"] = None
                            attribute["max_cardinality"] = None
                        else:
                            attribute["cardinality"] = None

                        attributes.append(attribute)

                    attribute_block["attributes"] = attributes
                    attribute_block["concept_associated"] = child2.attrib["id"]
                    self.attribute_blocks[shape_id] = attribute_block
                    attributes_found = True
                    break

                # If after a dense one to all evaluation the object selected cannot be associated
                # to any other object it means that it is a class
                if not attributes_found and value != "":

                    # First we have to verify they are actually concepts

                    # One way is to verify breaks in the text
                    value = clean_html_tags(value).strip()
                    if "|" in value:
                        self.errors["Concepts"].append({
                            "message": "Problems in text of the Concept",
                            "shape_id": shape_id,
                            "value": value
                        })
                        continue

                    # Other options: functionality / some / all markers, a
                    # datatype (third ":" part) or a cardinality ("..").
                    has_restriction_marker = any(
                        marker in value
                        for marker in ("(F)", "(some)", "(all)", "∀", "∃"))
                    if (has_restriction_marker
                            or len(value.split(":")) > 2
                            or len(value.split("..")) > 1):
                        self.errors["Attributes"].append({
                            "message":
                            "Attributes not attached to any concept",
                            "shape_id": shape_id,
                            "value": value
                        })
                        continue

                    if "\"" in value:
                        continue

                    value = clean_html_tags(value)
                    try:
                        concept["prefix"] = value.split(":")[0].strip()
                        concept["uri"] = value.split(":")[1].strip()

                        concept["prefix"][0]  # Check if error
                        concept["uri"][1]  # Check if error

                        # Taking into account possible spaces in the uri of the concept
                        concept["uri"] = re.sub(" ", "", concept["uri"])

                        concept["label"] = create_label(
                            concept["uri"], "class")
                        concept["xml_object"] = child
                    except Exception:
                        self.errors["Concepts"].append({
                            "message": "Problems in text of the concept",
                            "shape_id": shape_id,
                            "value": value
                        })
                        continue

                    self.concepts[shape_id] = concept

            except Exception:
                # Best effort: a cell that cannot be processed (for example
                # when its corners cannot be computed) is skipped.
                continue

        return self.concepts, self.attribute_blocks
Exemple #6
0
    def find_rhombuses(self):
        """Collect rhombus cells and register the properties they declare.

        For every child of ``self.root`` whose style contains ``rhombus``
        the cleaned text is parsed: the type is the text between ``<<`` and
        ``>>``, and the name is the ``prefix:uri`` found after the last
        ``|`` / ``>>`` (spaces are removed from the uri). The result is
        stored in ``self.rhombuses`` under the cell id.

        In addition:

        * an ``owl:ObjectProperty`` rhombus whose uri is not yet the uri of
          any entry of ``self.relations`` is added to ``self.relations``
          with no source/target and every restriction flag set to False;
        * an ``owl:DatatypeProperty`` rhombus whose uri is not yet the uri
          of any attribute in ``self.attribute_blocks`` is added to
          ``self.attribute_blocks`` as a one-attribute block.

        A property whose uri is already known is left untouched; nothing is
        stored under the rhombus id in that case.

        A rhombus with no text, or whose text has no ``:``, is reported in
        ``self.errors["Rhombuses"]`` and skipped.

        Returns:
            The tuple ``(self.rhombuses, self.errors)``.
        """
        for child in self.root:

            shape_id = child.attrib["id"]
            style = child.attrib.get("style", "")

            if "rhombus" not in style:
                continue

            value_html_clean = clean_html_tags(
                child.attrib["value"]) if "value" in child.attrib else None

            try:
                # AttributeError covers a rhombus without any text (None).
                rhombus_type = value_html_clean.split(">>")[0].split(
                    "<<")[-1].strip()

                name = value_html_clean.split("|")[-1].strip()
                name = name.split(">>")[-1].strip()
                prefix = name.split(":")[0].strip()
                uri = re.sub(" ", "", name.split(":")[1].strip())
            except (AttributeError, IndexError):
                error = {"shape_id": shape_id, "value": value_html_clean}
                self.errors["Rhombuses"].append(error)
                continue

            self.rhombuses[shape_id] = {
                "xml_object": child,
                "type": rhombus_type,
                "prefix": prefix,
                "uri": uri,
            }

            if rhombus_type == "owl:ObjectProperty":

                known_relation_uris = {
                    relation["uri"]
                    for relation in self.relations.values()
                    if "uri" in relation
                }

                # Register only inside the guard: otherwise an unrelated,
                # already existing relation would be stored under this id.
                if uri not in known_relation_uris:
                    self.relations[shape_id] = {
                        "source": None,
                        "target": None,
                        "xml_object": child,
                        "type": rhombus_type,
                        "prefix": prefix,
                        "uri": uri,
                        "label": create_label(uri, "property"),
                        "domain": False,
                        "range": False,
                        "allValuesFrom": False,
                        "someValuesFrom": False,
                        "functional": False,
                        "inverse_functional": False,
                        "transitive": False,
                        "symmetric": False,
                    }

            elif rhombus_type == "owl:DatatypeProperty":

                already_declared = any(
                    attribute["uri"] == uri
                    for block in self.attribute_blocks.values()
                    for attribute in block["attributes"])

                # Same guard as above, for attribute blocks.
                if not already_declared:
                    attribute = {
                        "prefix": prefix,
                        "uri": uri,
                        "label": create_label(uri, "property"),
                        "datatype": None,
                        "functional": False,
                        "domain": False,
                        "range": False,
                        "allValuesFrom": False,
                        "someValuesFrom": False,
                        "min_cardinality": None,
                        "max_cardinality": None,
                    }
                    self.attribute_blocks[shape_id] = {
                        "xml_object": child,
                        "attributes": [attribute],
                    }

        return self.rhombuses, self.errors
Exemple #7
0
    def find_relations(self):
        """Collect the edges of the diagram as relations.

        Every child of ``self.root`` that has an ``edge`` attribute yields a
        dict stored in ``self.relations`` under the cell id. It always holds
        ``source``, ``target`` and ``xml_object``; the rest depends on the
        kind of edge:

        * An edge without text whose source is an ellipse gets the type
          ``"ellipse_connection"``.
        * An edge without text looks for a free-text / edge-label cell whose
          ``parent`` is the edge and adopts its text. If there is still no
          text, the arrow heads decide: ``block`` gives
          ``rdfs:subClassOf``, ``open`` gives ``rdf:type``; anything else
          is reported and the relation is stored without a ``type``.
        * An edge whose text contains one of the predefined edge types gets
          that type and nothing else.
        * Any other edge is an ``owl:ObjectProperty``: domain/range are
          derived from the dash and start-arrow style, restriction and
          property flags from markers in the text, ``prefix``/``uri``/
          ``label`` from the name, and cardinalities from a trailing
          ``(min..max)``.

        Problems are appended to ``self.errors["Arrows"]``. An object
        property whose name or cardinality cannot be parsed is not stored.

        Returns:
            ``self.relations``.
        """
        # Detection of special type of edges
        edge_types = [
            "rdfs:subClassOf", "rdf:type", "owl:equivalentClass",
            "owl:disjointWith", "owl:complementOf", "rdfs:subPropertyOf",
            "owl:equivalentProperty", "owl:inverseOf", "rdfs:domain",
            "rdfs:range"
        ]
        # Raw string: "\(" in a normal literal is an invalid escape sequence.
        cardinality_pattern = r"\(([0-9][^)]+)\)"

        for child in self.root:

            shape_id = child.attrib["id"]
            style = child.attrib.get("style", "")

            if "edge" not in child.attrib:
                continue

            value = clean_html_tags(
                child.attrib["value"]) if "value" in child.attrib else None

            source = child.attrib.get("source")
            target = child.attrib.get("target")
            relation = {
                "source": source,
                "target": target,
                "xml_object": child,
            }

            if not value:

                # Looking for ellipses in a second iteration
                starts_at_ellipse = any(
                    source == other.attrib["id"]
                    and "ellipse" in other.attrib.get("style", "")
                    for other in self.root)
                if starts_at_ellipse:
                    # This edge is part of a unionOf / intersectionOf construct
                    # it is not useful beyond that construction
                    relation["type"] = "ellipse_connection"
                    self.relations[shape_id] = relation
                    continue

                # Sometimes edges have their value not embedded into the edge itself, at least not in the
                # "value" parameter of the object. We can track their associated value by looking for free text
                # and evaluating the "parent" parameter which will point to an edge.
                for other in self.root:
                    other_style = other.attrib.get("style", "")
                    is_label = "text" in other_style or "edgeLabel" in other_style
                    # Label cells lacking "parent" or "value" cannot supply
                    # the text and are passed over.
                    if (is_label and other.attrib.get("parent") == shape_id
                            and "value" in other.attrib):
                        value = clean_html_tags(other.attrib["value"])
                        break

                if source is None:
                    self.errors["Arrows"].append({
                        "message":
                        "Domain side of the relation is not connected to any shape, please check this",
                        "shape_id": shape_id,
                        "value": value
                    })

                if target is None:
                    self.errors["Arrows"].append({
                        "message":
                        "Range side of the relation is not connected to any shape, please check this",
                        "shape_id": shape_id,
                        "value": value
                    })

                # If after the evaluation of free text we cannot find any related text to the edge
                # we can say for sure that it is a "subclass" or "type" relationship
                if not value:
                    # Check for both sides of the edge, sometimes it can be tricky.
                    if "endArrow=block" in style or "startArrow=block" in style:
                        relation["type"] = "rdfs:subClassOf"
                    elif "endArrow=open" in style or "startArrow=open" in style:
                        relation["type"] = "rdf:type"
                    else:
                        self.errors["Arrows"].append({
                            "message": "Could not recognize type of arrow",
                            "shape_id": shape_id,
                            "value": "No value"
                        })
                    self.relations[shape_id] = relation
                    continue

            # From here on ``value`` is a non-empty string.
            special_type = next(
                (edge_type for edge_type in edge_types if edge_type in value),
                None)
            if special_type is not None:
                relation["type"] = special_type
                self.relations[shape_id] = relation
                continue

            # Domain Range evaluation. A dashed or solid edge with an oval
            # start arrow but neither startFill value leaves both keys unset.
            if "dashed=1" in style:
                if "startArrow=oval" not in style or "startFill=0" in style:
                    relation["domain"] = False
                    relation["range"] = False
                elif "startFill=1" in style:
                    relation["domain"] = source
                    relation["range"] = False
            else:
                if "startArrow=oval" not in style or "startFill=1" in style:
                    relation["domain"] = source
                    relation["range"] = target
                elif "startFill=0" in style:
                    relation["domain"] = False
                    relation["range"] = target

            # Existential Universal restriction evaluation
            relation["allValuesFrom"] = (
                "allValuesFrom" in value or "(all)" in value or "∀" in value)
            relation["someValuesFrom"] = (
                "someValuesFrom" in value or "(some)" in value
                or "∃" in value)

            # Property restriction evaluation
            relation["functional"] = "(F)" in value
            relation["inverse_functional"] = "(IF)" in value
            relation["transitive"] = "(T)" in value
            relation["symmetric"] = "(S)" in value

            # Prefix and uri
            try:
                full_name = clean_uri(value)
                full_name = full_name.split("|")[-1].strip().split(
                    ">>")[-1].strip()
                name_parts = full_name.split(":")

                prefix = name_parts[0].strip()
                uri = name_parts[-1].strip()

                # A name needs a ":" and non-empty text on both sides of it.
                well_formed = len(name_parts) > 1 and bool(prefix) and bool(uri)
                if well_formed:
                    uri = re.sub(" ", "", uri)
                    label = create_label(uri, "property")
            except Exception:
                well_formed = False

            if not well_formed:
                self.errors["Arrows"].append({
                    "message": "Problems in the text of the arrow",
                    "shape_id": shape_id,
                    "value": value
                })
                continue

            relation["prefix"] = prefix
            relation["uri"] = uri
            relation["label"] = label

            # Cardinality restriction evaluation
            try:
                found = re.findall(cardinality_pattern, value)
                if found:
                    # Expected form is "min..max"; a missing ".." raises
                    # IndexError below.
                    bounds = found[-1].split("..")
                    relation["min_cardinality"] = bounds[0]
                    relation["max_cardinality"] = bounds[1]
                else:
                    relation["min_cardinality"] = None
                    relation["max_cardinality"] = None
            except IndexError:
                self.errors["Arrows"].append({
                    "message": "Problems in cardinality definition",
                    "shape_id": shape_id,
                    "value": value
                })
                continue

            if relation["min_cardinality"] == "0":
                relation["min_cardinality"] = None

            if relation["max_cardinality"] == "N":
                relation["max_cardinality"] = None

            # Equal bounds (including both unset) collapse into a single
            # "cardinality" entry.
            if relation["min_cardinality"] == relation["max_cardinality"]:
                relation["cardinality"] = relation["min_cardinality"]
                relation["max_cardinality"] = None
                relation["min_cardinality"] = None
            else:
                relation["cardinality"] = None

            relation["type"] = "owl:ObjectProperty"

            self.relations[shape_id] = relation

        return self.relations