Esempio n. 1
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem,
                    lenient: bool, trusted: bool):
        """Build a ``Cas`` from an XMI document.

        The document is read with an incremental parser. Sofas, views and
        feature structures are collected first; afterwards the references
        between feature structures are resolved and the views are populated.

        Args:
            source: Passed unchanged to ``etree.iterparse``.
            typesystem: Type system used to look up the types of the parsed
                feature structures.
            lenient: If true, feature structures whose type is not found are
                skipped with a warning instead of raising.
            trusted: Forwarded to the parser as ``huge_tree``.

        Returns:
            The deserialized ``Cas``.

        Raises:
            TypeNotFoundError: If a type is not found and ``lenient`` is false.
            RuntimeError: If the element nesting does not fit the parser state
                machine described in the body.
            KeyError: If a sofa id, a referenced id or a view member id is not
                among the parsed elements.
        """
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"
        ATTR_XMI_ID = NS_XMI + "id"

        # States of the parser state machine (see the diagram below)
        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = {}
        views = {}
        feature_structures = {}
        # Text of child elements of the feature structure currently being
        # parsed, grouped by child tag
        children = defaultdict(list)
        # xmi ids of feature structures skipped because their type is unknown
        lenient_ids = set()

        context = etree.iterparse(source,
                                  events=("start", "end"),
                                  huge_tree=trusted)

        state = OUTSIDE_FS
        self._max_xmi_id = 0
        self._max_sofa_num = 0

        for event, elem in context:
            # Ignore the 'xmi:XMI'
            if elem.tag == TAG_XMI:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(elem)
                    sofas[sofa.xmiID] = sofa
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            else:
                # In XMI, array element features can be encoded as
                #
                # <cas:StringArray>
                #     <elements>LNC</elements>
                #     <elements>MTH</elements>
                #     <elements>SNOMEDCT_US</elements>
                # </cas:StringArray>
                #
                # In order to parse this with an incremental XML parser, we
                # need to employ a simple state machine. It is depicted in
                # the following.
                #
                #                    "start"               "start"
                #      +-----------+-------->+-----------+-------->+--------+
                #      | Outside   |         | Inside    |         | Inside |
                # +--->+ feature   |         | feature   |         | array  |
                #      | structure |         | structure |         | element|
                #      +-----------+<--------+-----------+<--------+--------+
                #                     "end"                 "end"
                if event == "start":
                    if state == OUTSIDE_FS:
                        # We saw the opening tag of a new feature structure
                        state = INSIDE_FS
                    elif state == INSIDE_FS:
                        # We saw the opening tag of an array element
                        state = INSIDE_ARRAY
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'start'".format(
                                state))
                elif event == "end":
                    if state == INSIDE_FS:
                        # We saw the closing tag of a feature structure
                        state = OUTSIDE_FS

                        # If a type was not found, ignore it if lenient, else raise an exception
                        try:
                            fs = self._parse_feature_structure(
                                typesystem, elem, children)
                            feature_structures[fs.xmiID] = fs
                        except TypeNotFoundError as e:
                            if not lenient:
                                # Bare raise keeps the original traceback
                                raise

                            warnings.warn(e.message)
                            xmiID = elem.attrib.get(ATTR_XMI_ID, None)
                            if xmiID:
                                lenient_ids.add(int(xmiID))

                        # The collected children belong to this feature
                        # structure only
                        children.clear()
                    elif state == INSIDE_ARRAY:
                        # We saw the closing tag of an array element
                        children[elem.tag].append(elem.text)
                        state = INSIDE_FS
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'end'".format(
                                state))
                else:
                    raise RuntimeError(
                        "Invalid XML event: [{0}]".format(event))

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        # Post-process feature values
        for fs in feature_structures.values():
            t = typesystem.get_type(fs.type)

            for feature in t.all_features:
                feature_name = feature.name

                if feature_name == "sofa":
                    # Replace the sofa id by the parsed sofa itself
                    value = getattr(fs, feature_name)
                    sofa = sofas[value]
                    setattr(fs, feature_name, sofa)
                    continue

                if (typesystem.is_primitive(feature.rangeTypeName)
                        or typesystem.is_primitive_collection(
                            feature.rangeTypeName)
                        or typesystem.is_primitive_collection(fs.type)):
                    # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                    continue

                # Everything else is treated as a reference (or a collection
                # of references) to other feature structures
                value = getattr(fs, feature_name)
                if value is None:
                    continue

                if typesystem.is_collection(fs.type, feature):
                    # A collection of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = [
                        feature_structures[int(ref)] for ref in value.split()
                    ]
                    setattr(fs, feature_name, targets)
                else:
                    setattr(fs, feature_name, feature_structures[int(value)])

        cas = Cas(typesystem=typesystem, lenient=lenient)
        for sofa in sofas.values():
            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")

                # We need to make sure that the sofa gets the real xmi, see #155
                view.get_sofa().xmiID = sofa.xmiID
            else:
                view = cas.create_view(sofa.sofaID,
                                       xmiID=sofa.xmiID,
                                       sofaNum=sofa.sofaNum)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            # If a sofa has no members, then UIMA might omit the view. In that case,
            # we create an empty view for it.
            if sofa.xmiID in views:
                proto_view = views[sofa.xmiID]
            else:
                proto_view = ProtoView(sofa.xmiID)

            for member_id in proto_view.members:
                # We ignore ids of feature structures for which we do not have a type
                if member_id in lenient_ids:
                    continue

                fs = feature_structures[member_id]

                # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
                if typesystem.is_instance_of(fs.type, "uima.tcas.Annotation"):
                    try:
                        fs.begin = sofa._offset_converter.uima_to_cassis(
                            fs.begin)
                        fs.end = sofa._offset_converter.uima_to_cassis(fs.end)
                    except KeyError:
                        # Best effort: an offset the converter does not know
                        # is left as it is. NOTE(review): if only `end` fails,
                        # `begin` has already been converted.
                        pass
                view.add_annotation(fs, keep_id=True)

        # New ids continue after the highest ones recorded during parsing
        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        return cas
Esempio n. 2
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
        """Build a ``Cas`` from an XMI document.

        The document is read with an incremental parser. Sofas, views and
        feature structures are collected first; afterwards the references
        between feature structures are resolved and the views are populated.
        While populating a view, ``begin``/``end`` of its members are rewritten
        through a ``JavaOffsetsMapper`` built from the view's sofa string.

        Args:
            source: Passed unchanged to ``etree.iterparse``.
            typesystem: Type system used to look up the types of the parsed
                feature structures.

        Returns:
            The deserialized ``Cas``.

        Raises:
            RuntimeError: If the element nesting does not fit the parser state
                machine described in the body.
            KeyError: If a sofa id, a referenced id or a view member id is not
                among the parsed elements.
        """
        # Local import: this method did not previously depend on `warnings`
        import warnings

        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        # States of the parser state machine (see the diagram below)
        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = {}
        views = {}
        feature_structures = {}
        # Text of child elements of the feature structure currently being
        # parsed, grouped by child tag
        children = defaultdict(list)

        context = etree.iterparse(source, events=("start", "end"))

        state = OUTSIDE_FS
        self._max_xmi_id = 0
        self._max_sofa_num = 0

        for event, elem in context:
            # Ignore the 'xmi:XMI'
            if elem.tag == TAG_XMI:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(elem)
                    sofas[sofa.xmiID] = sofa
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            else:
                # In XMI, array element features can be encoded as
                #
                # <cas:StringArray>
                #     <elements>LNC</elements>
                #     <elements>MTH</elements>
                #     <elements>SNOMEDCT_US</elements>
                # </cas:StringArray>
                #
                # In order to parse this with an incremental XML parser, we
                # need to employ a simple state machine. It is depicted in
                # the following.
                #
                #                    "start"               "start"
                #      +-----------+-------->+-----------+-------->+--------+
                #      | Outside   |         | Inside    |         | Inside |
                # +--->+ feature   |         | feature   |         | array  |
                #      | structure |         | structure |         | element|
                #      +-----------+<--------+-----------+<--------+--------+
                #                     "end"                 "end"
                if event == "start":
                    if state == OUTSIDE_FS:
                        # We saw the opening tag of a new feature structure
                        state = INSIDE_FS
                    elif state == INSIDE_FS:
                        # We saw the opening tag of an array element
                        state = INSIDE_ARRAY
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'start'".format(
                                state))
                elif event == "end":
                    if state == INSIDE_FS:
                        # We saw the closing tag of a feature structure
                        state = OUTSIDE_FS
                        fs = self._parse_feature_structure(
                            typesystem, elem, children)
                        feature_structures[fs.xmiID] = fs

                        # The collected children belong to this feature
                        # structure only
                        children.clear()
                    elif state == INSIDE_ARRAY:
                        # We saw the closing tag of an array element
                        children[elem.tag].append(elem.text)
                        state = INSIDE_FS
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'end'".format(
                                state))
                else:
                    raise RuntimeError(
                        "Invalid XML event: [{0}]".format(event))

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        # Post-process feature values
        for fs in feature_structures.values():
            t = typesystem.get_type(fs.type)

            for feature in t.all_features:
                feature_name = feature.name

                if feature_name == "sofa":
                    # Replace the sofa id by the parsed sofa itself
                    value = getattr(fs, feature_name)
                    sofa = sofas[value]
                    setattr(fs, feature_name, sofa)
                    continue

                if (typesystem.is_primitive(feature.rangeTypeName)
                        or typesystem.is_primitive_collection(
                            feature.rangeTypeName)
                        or typesystem.is_primitive_collection(fs.type)):
                    # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                    continue

                # Everything else is treated as a reference (or a collection
                # of references) to other feature structures
                value = getattr(fs, feature_name)
                if value is None:
                    continue

                if typesystem.is_collection(fs.type, feature):
                    # A collection of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = [
                        feature_structures[int(ref)] for ref in value.split()
                    ]
                    setattr(fs, feature_name, targets)
                else:
                    setattr(fs, feature_name, feature_structures[int(value)])

        cas = Cas(typesystem=typesystem)
        for sofa in sofas.values():
            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")
            else:
                view = cas.create_view(sofa.sofaID,
                                       xmiID=sofa.xmiID,
                                       sofaNum=sofa.sofaNum)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            # If a sofa has no members, then UIMA might omit the view. In that case,
            # we create an empty view for it.
            if sofa.xmiID in views:
                proto_view = views[sofa.xmiID]
            else:
                proto_view = ProtoView(sofa.xmiID)

            # Patch: Rewrite offsets. Without a sofa string there is nothing
            # to map against, so offsets are left untouched.
            if view.sofa_string is not None:
                mapper = JavaOffsetsMapper(view.sofa_string)
            else:
                mapper = None
            # End patch

            for member_id in proto_view.members:
                annotation = feature_structures[member_id]

                # Patch: Rewrite offsets
                if (mapper
                        and hasattr(annotation, "begin")
                        and hasattr(annotation, "end")
                        and annotation.begin is not None
                        and annotation.end is not None):

                    annotation.begin = mapper.java_to_python_begin(
                        annotation.begin)
                    try:
                        annotation.end = mapper.java_to_python_end(
                            annotation.end)
                    except Exception as exc:
                        # Best effort, as before: the end offset stays
                        # unmapped and deserialization continues, but the
                        # problem is reported as a warning instead of debug
                        # prints to stdout.
                        warnings.warn(
                            "Could not map end offset {0!r} of feature "
                            "structure {1!r}: {2!r}".format(
                                annotation.end, member_id, exc))
                # End patch

                view.add_annotation(annotation, keep_id=True)

        # New ids continue after the highest ones recorded during parsing
        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        return cas
Esempio n. 3
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: bool, trusted: bool):
        """Build a ``Cas`` from an XMI document.

        The document is read with an incremental parser. Sofas, views and
        feature structures are collected first; afterwards primitive values are
        parsed, references between feature structures are resolved and the
        views are populated.

        Args:
            source: Passed unchanged to ``etree.iterparse``.
            typesystem: Type system used to look up and inspect the types of
                the parsed feature structures.
            lenient: If true, feature structures whose type is not found are
                skipped with a warning instead of raising.
            trusted: Forwarded to the parser as ``huge_tree``.

        Returns:
            The deserialized ``Cas``.

        Raises:
            TypeNotFoundError: If a type is not found and ``lenient`` is false.
            RuntimeError: If the element nesting does not fit the parser state
                machine described in the body.
            KeyError: If a sofa id, a referenced id or a view member id is not
                among the parsed elements.
        """
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"
        ATTR_XMI_ID = NS_XMI + "id"

        # States of the parser state machine (see the diagram below)
        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = {}
        views = {}
        feature_structures = {}
        # Text of child elements of the feature structure currently being
        # parsed, grouped by child tag
        children = defaultdict(list)
        # xmi ids of feature structures skipped because their type is unknown
        lenient_ids = set()

        context = etree.iterparse(source, events=("start", "end"), huge_tree=trusted)

        state = OUTSIDE_FS
        self._max_xmi_id = 0
        self._max_sofa_num = 0

        for event, elem in context:
            # Ignore the 'xmi:XMI'
            if elem.tag == TAG_XMI:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(typesystem, elem)
                    sofas[sofa.xmiID] = sofa
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            else:
                # In XMI, array element features can be encoded as
                #
                # <cas:StringArray>
                #     <elements>LNC</elements>
                #     <elements>MTH</elements>
                #     <elements>SNOMEDCT_US</elements>
                # </cas:StringArray>
                #
                # In order to parse this with an incremental XML parser, we
                # need to employ a simple state machine. It is depicted in
                # the following.
                #
                #                    "start"               "start"
                #      +-----------+-------->+-----------+-------->+--------+
                #      | Outside   |         | Inside    |         | Inside |
                # +--->+ feature   |         | feature   |         | array  |
                #      | structure |         | structure |         | element|
                #      +-----------+<--------+-----------+<--------+--------+
                #                     "end"                 "end"
                if event == "start":
                    if state == OUTSIDE_FS:
                        # We saw the opening tag of a new feature structure
                        state = INSIDE_FS
                    elif state == INSIDE_FS:
                        # We saw the opening tag of an array element
                        state = INSIDE_ARRAY
                    else:
                        raise RuntimeError(f"Invalid state transition: [{state}] 'start'")
                elif event == "end":
                    if state == INSIDE_FS:
                        # We saw the closing tag of a feature structure
                        state = OUTSIDE_FS

                        # If a type was not found, ignore it if lenient, else raise an exception
                        try:
                            fs = self._parse_feature_structure(typesystem, elem, children)
                            feature_structures[fs.xmiID] = fs
                        except TypeNotFoundError as e:
                            if not lenient:
                                # Bare raise keeps the original traceback
                                raise

                            warnings.warn(e.message)
                            xmiID = elem.attrib.get(ATTR_XMI_ID, None)
                            if xmiID:
                                lenient_ids.add(int(xmiID))

                        # The collected children belong to this feature
                        # structure only
                        children.clear()
                    elif state == INSIDE_ARRAY:
                        # We saw the closing tag of an array element
                        children[elem.tag].append(elem.text)
                        state = INSIDE_FS
                    else:
                        raise RuntimeError(f"Invalid state transition: [{state}] 'end'")
                else:
                    raise RuntimeError(f"Invalid XML event: [{event}]")

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        # Post-process feature values
        for fs in feature_structures.values():
            t = typesystem.get_type(fs.type.name)

            for feature in t.all_features:
                feature_name = feature.name
                value = fs[feature_name]

                if feature_name == "sofa":
                    # Replace the sofa id by the parsed sofa itself
                    fs[feature_name] = sofas[value]
                    continue

                if typesystem.is_instance_of(fs.type.name, TYPE_NAME_STRING_ARRAY):
                    # We already parsed string arrays to a Python list of string
                    # before, so we do not need to work more on this
                    continue
                elif typesystem.is_primitive(feature.rangeType):
                    fs[feature_name] = self._parse_primitive_value(feature.rangeType, value)
                    continue
                elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
                    # Separately rendered arrays (typically used with multipleReferencesAllowed = True)
                    fs[feature_name] = self._parse_primitive_array(fs.type, value)
                elif typesystem.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
                    # Array feature rendered inline (multipleReferencesAllowed = False|None)
                    # We also end up here for array features that were rendered as child elements. No need to parse
                    # them again, so we check if the value is still a string (i.e. attribute value) and only then
                    # process it
                    if isinstance(value, str):
                        FSType = feature.rangeType
                        fs[feature_name] = FSType(elements=self._parse_primitive_array(feature.rangeType, value))
                elif typesystem.is_primitive_list(feature.rangeType) and not feature.multipleReferencesAllowed:
                    # List feature rendered inline (multipleReferencesAllowed = False|None)
                    # As for arrays above, only a value that is still a string (i.e. attribute value) is parsed
                    if isinstance(value, str):
                        fs[feature_name] = self._parse_primitive_list(feature.rangeType, value)
                else:
                    # Resolve references here
                    if value is None:
                        continue

                    # Resolve references
                    if fs.type.name == TYPE_NAME_FS_ARRAY or (
                        feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed
                    ):
                        # An array of references is a list of integers separated
                        # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                        targets = [feature_structures[int(ref)] for ref in value.split()]

                        if feature.rangeType.name == TYPE_NAME_FS_ARRAY:
                            # Wrap inline array into the appropriate array object
                            ArrayType = typesystem.get_type(TYPE_NAME_FS_ARRAY)
                            targets = ArrayType(elements=targets)

                        fs[feature_name] = targets
                    elif feature.rangeType.name == TYPE_NAME_FS_LIST and not feature.multipleReferencesAllowed:
                        # List feature rendered inline (multipleReferencesAllowed = False|None)
                        # Only values that are still a plain list or string are handed to the list parser
                        if isinstance(value, (list, str)):
                            fs[feature_name] = self._parse_fs_list(feature_structures, feature.rangeType, value)
                    else:
                        # A single reference is the integer id of the target
                        fs[feature_name] = feature_structures[int(value)]

        cas = Cas(typesystem=typesystem, lenient=lenient)
        for sofa in sofas.values():
            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")

                # We need to make sure that the sofa gets the real xmi, see #155
                view.get_sofa().xmiID = sofa.xmiID
            else:
                view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            # If a sofa has no members, then UIMA might omit the view. In that case,
            # we create an empty view for it.
            if sofa.xmiID in views:
                proto_view = views[sofa.xmiID]
            else:
                proto_view = ProtoView(sofa.xmiID)

            for member_id in proto_view.members:
                # We ignore ids of feature structures for which we do not have a type
                if member_id in lenient_ids:
                    continue

                fs = feature_structures[member_id]

                # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
                if typesystem.is_instance_of(fs.type.name, "uima.tcas.Annotation"):
                    fs.begin = sofa._offset_converter.external_to_python(fs.begin)
                    fs.end = sofa._offset_converter.external_to_python(fs.end)

                view.add(fs, keep_id=True)

        # New ids continue after the highest ones recorded during parsing
        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        return cas
Esempio n. 4
0
    def deserialize(self,
                    source: Union[IO, str],
                    typesystem: Optional[TypeSystem] = None) -> Cas:
        """Build a ``Cas`` from a JSON document.

        Types embedded in the document are parsed into a fresh type system
        and merged with ``typesystem``. Feature structures are then parsed,
        the registered post-processors are run and finally the views section
        is handled.

        Args:
            source: JSON text if it is a ``str``, otherwise an object that
                ``json.load`` can read from.
            typesystem: Optional type system to merge with the types embedded
                in the document. An empty one is used when omitted.

        Returns:
            The deserialized ``Cas``.
        """
        if isinstance(source, str):
            data = json.loads(source)
        else:
            data = json.load(source)

        self._max_xmi_id = 0
        self._max_sofa_num = 0
        self._post_processors = []

        # A document without a types section simply embeds no types
        embedded_typesystem = TypeSystem()
        json_typesystem = data.get(TYPES_FIELD) or {}
        for type_name, json_type in json_typesystem.items():
            self._parse_type(embedded_typesystem, type_name, json_type)

        # The parameter defaults to None; substitute an empty type system so
        # that merge_typesystems is always given TypeSystem instances
        if typesystem is None:
            typesystem = TypeSystem()
        typesystem = merge_typesystems(typesystem, embedded_typesystem)

        cas = Cas(typesystem=typesystem)

        # The feature structures come either as a list (the id is a field of
        # each entry) or as a dict keyed by the id as a string. Any other
        # value, including a missing section, yields no feature structures.
        feature_structures = {}
        json_feature_structures = data.get(FEATURE_STRUCTURES_FIELD)
        if isinstance(json_feature_structures, list):
            ids_are_keys = False
            entries = ((json_fs.get(ID_FIELD), json_fs)
                       for json_fs in json_feature_structures)
        elif isinstance(json_feature_structures, dict):
            ids_are_keys = True
            entries = json_feature_structures.items()
        else:
            ids_are_keys = False
            entries = ()

        for raw_id, json_fs in entries:
            is_sofa = json_fs.get(TYPE_FIELD) == TYPE_NAME_SOFA
            # JSON object keys are strings, ids inside entries are used as is
            fs_id = int(raw_id) if ids_are_keys else raw_id
            if is_sofa:
                fs = self._parse_sofa(cas, fs_id, json_fs,
                                      feature_structures)
            else:
                fs = self._parse_feature_structure(typesystem, fs_id,
                                                   json_fs,
                                                   feature_structures)
            feature_structures[fs.xmiID] = fs

        for post_processor in self._post_processors:
            post_processor()

        # New ids continue after the highest ones recorded during parsing
        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        # At this point all views for which we have a sofa with a known ID and sofaNum have already been created
        # as part of parsing the feature structures. Thus, if there are any views remaining that are only declared
        # in the views section, we just create them with auto-assigned IDs
        json_views = data.get(VIEWS_FIELD) or {}
        for view_name, json_view in json_views.items():
            self._parse_view(cas, view_name, json_view, feature_structures)

        return cas