Beispiel #1
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
        """Build a ``Cas`` from an XMI document by streaming over its elements.

        The document is read with ``etree.iterparse``. Sofas, views and feature
        structures are collected first; afterwards reference-valued features
        are replaced by the feature structures they point to, and finally one
        view per sofa is filled with the members listed in the document.

        Args:
            source: Object handed to ``etree.iterparse`` as the XML source.
            typesystem: Type system used to parse and post-process the
                feature structures.

        Returns:
            The populated ``Cas``.

        Raises:
            RuntimeError: On an unexpected parser event or state transition,
                or if the number of sofas differs from the number of views.
        """
        xmi_ns = "{http://www.omg.org/XMI}"
        cas_ns = "{http:///uima/cas.ecore}"

        # The 'xmi:XMI' and 'cas:NULL' elements carry nothing we need
        ignored_tags = (xmi_ns + "XMI", cas_ns + "NULL")
        sofa_tag = cas_ns + "Sofa"
        view_tag = cas_ns + "View"

        # Parser states; the numeric codes show up in error messages
        OUTSIDE_FS, INSIDE_FS, INSIDE_ARRAY = 1, 2, 3

        # In XMI, array element features can be encoded as child elements:
        #
        #   <cas:StringArray>
        #       <elements>LNC</elements>
        #       <elements>MTH</elements>
        #   </cas:StringArray>
        #
        # An incremental parser therefore has to track the nesting depth:
        # a "start" event moves OUTSIDE_FS -> INSIDE_FS -> INSIDE_ARRAY and
        # an "end" event moves back one step. Any other move is an error.
        on_start = {OUTSIDE_FS: INSIDE_FS, INSIDE_FS: INSIDE_ARRAY}

        sofas = []
        views = {}
        feature_structures = {}
        children = defaultdict(list)
        state = OUTSIDE_FS

        for event, elem in etree.iterparse(source, events=("start", "end")):
            tag = elem.tag
            closing = event == "end"

            if tag in ignored_tags:
                pass
            elif tag == sofa_tag:
                if closing:
                    sofas.append(self._parse_sofa(elem))
            elif tag == view_tag:
                if closing:
                    parsed_view = self._parse_view(elem)
                    views[parsed_view.sofa] = parsed_view
            elif event == "start":
                if state not in on_start:
                    raise RuntimeError(
                        "Invalid state transition: [{0}] 'start'".format(
                            state))
                state = on_start[state]
            elif closing:
                if state == INSIDE_ARRAY:
                    # Closing tag of an array element: remember its text
                    children[tag].append(elem.text)
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # Closing tag of a feature structure: it is now complete
                    state = OUTSIDE_FS
                    parsed = self._parse_feature_structure(
                        typesystem, elem, children)
                    feature_structures[parsed.xmiID] = parsed
                    children.clear()
                else:
                    raise RuntimeError(
                        "Invalid state transition: [{0}] 'end'".format(
                            state))
            else:
                raise RuntimeError(
                    "Invalid XML event: [{0}]".format(event))

            # Release elements that have been fully processed
            if closing:
                self._clear_elem(elem)

        if len(sofas) != len(views):
            raise RuntimeError("Number of views and sofas is not equal!")

        # Replace reference ids by the feature structures they refer to
        for fs in feature_structures.values():
            for feature in typesystem.get_type(fs.type).all_features:
                name = feature.name
                if name == "sofa":
                    continue

                range_type = feature.rangeTypeName
                # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                if typesystem.is_primitive(range_type) or \
                        typesystem.is_primitive_collection(range_type):
                    continue

                raw = getattr(fs, name)
                if raw is None:
                    continue

                if typesystem.is_collection(range_type):
                    # A collection of references is a whitespace separated
                    # list of ids, e.g. <foo:bar elements="1 2 3 42" />
                    resolved = [
                        feature_structures[int(ref)] for ref in raw.split()
                    ]
                else:
                    resolved = feature_structures[int(raw)]
                setattr(fs, name, resolved)

        cas = Cas(typesystem)
        for sofa in sofas:
            members = views[sofa.xmiID].members

            # The initial view already exists on a fresh Cas
            view = (cas.get_view("_InitialView")
                    if sofa.sofaID == "_InitialView" else
                    cas.create_view(sofa.sofaID))

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            for member_id in members:
                view.add_annotation(feature_structures[member_id])

        return cas
Beispiel #2
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem,
                    lenient: bool, trusted: bool):
        """Build a ``Cas`` from an XMI document by streaming over its elements.

        The document is read with ``etree.iterparse``. Sofas, views and feature
        structures are collected first; afterwards ``sofa`` features and
        reference-valued features are replaced by the objects they point to,
        and finally one view per sofa is filled with its members.

        Args:
            source: Object handed to ``etree.iterparse`` as the XML source.
            typesystem: Type system used to parse and post-process the
                feature structures.
            lenient: If true, a feature structure whose parsing raises
                ``TypeNotFoundError`` is skipped with a warning instead of
                aborting; references to and memberships of such skipped
                feature structures are dropped as well.
            trusted: Forwarded to ``etree.iterparse`` as ``huge_tree``.

        Returns:
            The populated ``Cas``.

        Raises:
            RuntimeError: On an unexpected parser event or state transition.
            TypeNotFoundError: If a type is unknown and ``lenient`` is false.
        """
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = {}
        views = {}
        feature_structures = {}
        children = defaultdict(list)
        # xmi ids of feature structures skipped because their type is unknown
        lenient_ids = set()

        context = etree.iterparse(source,
                                  events=("start", "end"),
                                  huge_tree=trusted)

        state = OUTSIDE_FS
        # Reset here and used below to seed the id generators; presumably
        # updated by the _parse_* helpers while parsing (not visible here).
        self._max_xmi_id = 0
        self._max_sofa_num = 0

        # In XMI, array element features can be encoded as child elements:
        #
        #   <cas:StringArray>
        #       <elements>LNC</elements>
        #       <elements>MTH</elements>
        #       <elements>SNOMEDCT_US</elements>
        #   </cas:StringArray>
        #
        # To parse this incrementally we use a small state machine: a "start"
        # event moves OUTSIDE_FS -> INSIDE_FS -> INSIDE_ARRAY, an "end" event
        # moves back one step; every other transition is an error.
        for event, elem in context:
            # Ignore the 'xmi:XMI'
            if elem.tag == TAG_XMI:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(elem)
                    sofas[sofa.xmiID] = sofa
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            elif event == "start":
                if state == OUTSIDE_FS:
                    # We saw the opening tag of a new feature structure
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # We saw the opening tag of an array element
                    state = INSIDE_ARRAY
                else:
                    raise RuntimeError(
                        "Invalid state transition: [{0}] 'start'".format(
                            state))
            elif event == "end":
                if state == INSIDE_FS:
                    # We saw the closing tag of a feature structure
                    state = OUTSIDE_FS

                    # If a type was not found, ignore it if lenient, else raise an exception
                    try:
                        fs = self._parse_feature_structure(
                            typesystem, elem, children)
                        feature_structures[fs.xmiID] = fs
                    except TypeNotFoundError as e:
                        if not lenient:
                            raise

                        warnings.warn(e.message)
                        xmiID = elem.attrib.get(NS_XMI + "id", None)
                        if xmiID:
                            lenient_ids.add(int(xmiID))

                    children.clear()
                elif state == INSIDE_ARRAY:
                    # We saw the closing tag of an array element
                    children[elem.tag].append(elem.text)
                    state = INSIDE_FS
                else:
                    raise RuntimeError(
                        "Invalid state transition: [{0}] 'end'".format(
                            state))
            else:
                raise RuntimeError(
                    "Invalid XML event: [{0}]".format(event))

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        # Post-process feature values
        for fs in feature_structures.values():
            t = typesystem.get_type(fs.type)

            for feature in t.all_features:
                feature_name = feature.name

                if feature_name == "sofa":
                    # Replace the sofa id by the parsed sofa itself
                    value = getattr(fs, feature_name)
                    sofa = sofas[value]
                    setattr(fs, feature_name, sofa)
                    continue

                if (typesystem.is_primitive(feature.rangeTypeName)
                        or typesystem.is_primitive_collection(
                            feature.rangeTypeName)
                        or typesystem.is_primitive_collection(fs.type)):
                    # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                    continue

                # Resolve references here
                value = getattr(fs, feature_name)
                if value is None:
                    continue

                if typesystem.is_collection(fs.type, feature):
                    # A collection of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = []
                    for ref in value.split():
                        target_id = int(ref)
                        # Targets skipped in lenient mode were never stored,
                        # so drop the dangling reference instead of failing
                        # with a KeyError.
                        if target_id in lenient_ids:
                            continue
                        targets.append(feature_structures[target_id])
                    setattr(fs, feature_name, targets)
                else:
                    target_id = int(value)
                    if target_id in lenient_ids:
                        # Dangling reference to a skipped feature structure
                        target = None
                    else:
                        target = feature_structures[target_id]
                    setattr(fs, feature_name, target)

        cas = Cas(typesystem=typesystem, lenient=lenient)
        for sofa in sofas.values():
            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")

                # We need to make sure that the sofa gets the real xmi, see #155
                view.get_sofa().xmiID = sofa.xmiID
            else:
                view = cas.create_view(sofa.sofaID,
                                       xmiID=sofa.xmiID,
                                       sofaNum=sofa.sofaNum)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            # If a sofa has no members, then UIMA might omit the view. In that case,
            # we create an empty view for it.
            if sofa.xmiID in views:
                proto_view = views[sofa.xmiID]
            else:
                proto_view = ProtoView(sofa.xmiID)

            for member_id in proto_view.members:
                # We ignore ids of feature structures for which we do not have a type
                if member_id in lenient_ids:
                    continue

                fs = feature_structures[member_id]

                # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
                if typesystem.is_instance_of(fs.type, "uima.tcas.Annotation"):
                    try:
                        begin = sofa._offset_converter.uima_to_cassis(
                            fs.begin)
                        end = sofa._offset_converter.uima_to_cassis(fs.end)
                    except KeyError:
                        # Best effort: an offset that cannot be mapped leaves
                        # the annotation untouched. Both offsets are assigned
                        # together so begin and end never end up in
                        # different offset systems.
                        pass
                    else:
                        fs.begin = begin
                        fs.end = end
                view.add_annotation(fs, keep_id=True)

        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        return cas
Beispiel #3
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
        """Build a ``Cas`` from an XMI document by streaming over its elements.

        The document is read with ``etree.iterparse``. Sofas, views and feature
        structures are collected first; afterwards ``sofa`` features and
        reference-valued features are replaced by the objects they point to,
        and finally one view per sofa is filled with its members. While
        filling a view, ``begin``/``end`` offsets of its members are rewritten
        through a ``JavaOffsetsMapper`` built from the sofa string.

        Args:
            source: Object handed to ``etree.iterparse`` as the XML source.
            typesystem: Type system used to parse and post-process the
                feature structures.

        Returns:
            The populated ``Cas``.

        Raises:
            RuntimeError: On an unexpected parser event or state transition.
        """
        # Local import: this block is the only user and must not depend on a
        # module-level import being present.
        import warnings

        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = {}
        views = {}
        feature_structures = {}
        children = defaultdict(list)

        context = etree.iterparse(source, events=("start", "end"))

        state = OUTSIDE_FS
        # Reset here and used below to seed the id generators; presumably
        # updated by the _parse_* helpers while parsing (not visible here).
        self._max_xmi_id = 0
        self._max_sofa_num = 0

        # In XMI, array element features can be encoded as child elements:
        #
        #   <cas:StringArray>
        #       <elements>LNC</elements>
        #       <elements>MTH</elements>
        #       <elements>SNOMEDCT_US</elements>
        #   </cas:StringArray>
        #
        # To parse this incrementally we use a small state machine: a "start"
        # event moves OUTSIDE_FS -> INSIDE_FS -> INSIDE_ARRAY, an "end" event
        # moves back one step; every other transition is an error.
        for event, elem in context:
            # Ignore the 'xmi:XMI'
            if elem.tag == TAG_XMI:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(elem)
                    sofas[sofa.xmiID] = sofa
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            elif event == "start":
                if state == OUTSIDE_FS:
                    # We saw the opening tag of a new feature structure
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # We saw the opening tag of an array element
                    state = INSIDE_ARRAY
                else:
                    raise RuntimeError(
                        "Invalid state transition: [{0}] 'start'".format(
                            state))
            elif event == "end":
                if state == INSIDE_FS:
                    # We saw the closing tag of a feature structure
                    state = OUTSIDE_FS
                    fs = self._parse_feature_structure(
                        typesystem, elem, children)
                    feature_structures[fs.xmiID] = fs

                    children.clear()
                elif state == INSIDE_ARRAY:
                    # We saw the closing tag of an array element
                    children[elem.tag].append(elem.text)
                    state = INSIDE_FS
                else:
                    raise RuntimeError(
                        "Invalid state transition: [{0}] 'end'".format(
                            state))
            else:
                raise RuntimeError(
                    "Invalid XML event: [{0}]".format(event))

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        # Post-process feature values
        for fs in feature_structures.values():
            t = typesystem.get_type(fs.type)

            for feature in t.all_features:
                feature_name = feature.name

                if feature_name == "sofa":
                    # Replace the sofa id by the parsed sofa itself
                    value = getattr(fs, feature_name)
                    sofa = sofas[value]
                    setattr(fs, feature_name, sofa)
                    continue

                if (typesystem.is_primitive(feature.rangeTypeName)
                        or typesystem.is_primitive_collection(
                            feature.rangeTypeName)
                        or typesystem.is_primitive_collection(fs.type)):
                    # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                    continue

                # Resolve references here
                value = getattr(fs, feature_name)
                if value is None:
                    continue

                if typesystem.is_collection(fs.type, feature):
                    # A collection of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = [
                        feature_structures[int(ref)] for ref in value.split()
                    ]
                    setattr(fs, feature_name, targets)
                else:
                    target = feature_structures[int(value)]
                    setattr(fs, feature_name, target)

        cas = Cas(typesystem=typesystem)
        for sofa in sofas.values():
            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")
            else:
                view = cas.create_view(sofa.sofaID,
                                       xmiID=sofa.xmiID,
                                       sofaNum=sofa.sofaNum)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            # If a sofa has no members, then UIMA might omit the view. In that case,
            # we create an empty view for it.
            if sofa.xmiID in views:
                proto_view = views[sofa.xmiID]
            else:
                proto_view = ProtoView(sofa.xmiID)

            # Offsets can only be rewritten when the view has a sofa string
            if view.sofa_string is not None:
                mapper = JavaOffsetsMapper(view.sofa_string)
            else:
                mapper = None

            for member_id in proto_view.members:
                annotation = feature_structures[member_id]

                # Rewrite offsets of members that carry both begin and end
                if (mapper
                        and hasattr(annotation, "begin")
                        and hasattr(annotation, "end")
                        and annotation.begin is not None
                        and annotation.end is not None):
                    begin = mapper.java_to_python_begin(annotation.begin)
                    try:
                        end = mapper.java_to_python_end(annotation.end)
                    except Exception as e:
                        # Best effort: report the problem through the warnings
                        # machinery (not stdout) and leave the annotation
                        # untouched, so begin and end never end up in
                        # different offset systems.
                        warnings.warn(
                            "Could not map end offset [{0}] of feature "
                            "structure [{1}]: {2!r}".format(
                                annotation.end, member_id, e))
                    else:
                        annotation.begin = begin
                        annotation.end = end

                view.add_annotation(annotation, keep_id=True)

        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        return cas