Exemple #1
0
def test_create_view_throws_if_view_already_exists():
    cas = Cas()
    cas.create_view("testView")

    with pytest.raises(ValueError,
                       message=r"A view with name [testView] already exists!"):
        cas.create_view("testView")
Exemple #2
0
def test_get_view_finds_existing_view():
    cas = Cas()
    cas.create_view("testView")
    cas.sofa_string = "Initial"

    view = cas.get_view("testView")
    view.sofa_string = "testView42"

    sofa = view.get_sofa()
    attr.validate(sofa)
    assert sofa.sofaID == "testView"
    assert cas.sofa_string == "Initial"
    assert view.sofa_string == "testView42"
Exemple #3
0
def test_create_view_creates_view():
    cas = Cas()

    view = cas.create_view("testView")
    sofa = view.get_sofa()

    attr.validate(sofa)
    assert sofa.sofaID == "testView"
Exemple #4
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_NULL = NS_CAS + "NULL"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        sofas = []
        views = {}
        annotations = {}

        context = etree.iterparse(source, events=("end",))

        for event, elem in context:
            assert event == "end"

            if elem.tag == TAG_XMI:
                # Ignore the closing 'xmi:XMI' tag
                pass
            elif elem.tag == TAG_CAS_NULL:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                sofa = self._parse_sofa(elem)
                sofas.append(sofa)
            elif elem.tag == TAG_CAS_VIEW:
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
            else:
                annotation = self._parse_annotation(typesystem, elem)
                annotations[annotation.xmiID] = annotation

            # Free already processed elements from memory
            self._clear_elem(elem)

        if len(sofas) != len(views):
            raise RuntimeError("Number of views and sofas is not equal!")

        cas = Cas()
        for sofa in sofas:
            proto_view = views[sofa.xmiID]

            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")
            else:
                view = cas.create_view(sofa.sofaID)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            for member_id in proto_view.members:
                annotation = annotations[member_id]

                view.add_annotation(annotation)

        return cas
Exemple #5
0
def test_removing_throws_if_fs_in_other_view(small_typesystem_xml, tokens,
                                             sentences):
    cas = Cas(typesystem=load_typesystem(small_typesystem_xml))
    cas.add_annotations(tokens)

    view = cas.create_view("testView")

    with pytest.raises(ValueError):
        view.remove_annotation(tokens[0])
Exemple #6
0
def test_select_only_returns_annotations_of_current_view(tokens, sentences, small_typesystem_xml):
    cas = Cas(typesystem=load_typesystem(small_typesystem_xml))
    cas.add_annotations(tokens)
    view = cas.create_view("testView")
    view.add_annotations(sentences)

    actual_annotations_in_initial_view = list(cas.get_view("_InitialView").select_all())
    actual_annotations_in_test_view = list(cas.get_view("testView").select_all())

    assert tokens == actual_annotations_in_initial_view
    assert sentences == actual_annotations_in_test_view
Exemple #7
0
def test_removing_removes_from_view(small_typesystem_xml, tokens, sentences):
    annotations = tokens + sentences
    cas = Cas(typesystem=load_typesystem(small_typesystem_xml))
    view = cas.create_view("testView")

    cas.add_annotations(annotations)
    view.add_annotations(annotations)

    for annotation in annotations:
        cas.remove_annotation(annotation)

    assert set(cas.select("uima.tcas.Annotation")) == set()
    assert set(view.select("uima.tcas.Annotation")) == set(annotations)
Exemple #8
0
    def _get_or_create_view(self,
                            cas: Cas,
                            view_name: str,
                            fs_id: Optional[int] = None,
                            sofa_num: Optional[int] = None) -> Cas:
        if view_name == NAME_DEFAULT_SOFA:
            view = cas.get_view(NAME_DEFAULT_SOFA)

            # We need to make sure that the sofa gets the real xmi, see #155
            if fs_id is not None:
                view.get_sofa().xmiID = fs_id

            return view
        else:
            return cas.create_view(view_name, xmiID=fs_id, sofaNum=sofa_num)
Exemple #9
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem,
                    lenient: bool, trusted: bool):
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = {}
        views = {}
        feature_structures = {}
        children = defaultdict(list)
        lenient_ids = set()

        context = etree.iterparse(source,
                                  events=("start", "end"),
                                  huge_tree=trusted)

        state = OUTSIDE_FS
        self._max_xmi_id = 0
        self._max_sofa_num = 0

        for event, elem in context:
            # Ignore the 'xmi:XMI'
            if elem.tag == TAG_XMI:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(elem)
                    sofas[sofa.xmiID] = sofa
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            else:
                """
                In XMI, array element features can be encoded as
                
                <cas:StringArray>
                    <elements>LNC</elements>
                    <elements>MTH</elements>
                    <elements>SNOMEDCT_US</elements>
                </cas:StringArray>
                
                In order to parse this with an incremental XML parser, we need to employ 
                a simple state machine. It is depicted in the following.
                            
                                   "start"               "start"
                     +-----------+-------->+-----------+-------->+--------+
                     | Outside   |         | Inside    |         | Inside |
                +--->+ feature   |         | feature   |         | array  |
                     | structure |         | structure |         | element|
                     +-----------+<--------+-----------+<--------+--------+
                                    "end"                 "end"                                
                """
                if event == "start":
                    if state == OUTSIDE_FS:
                        # We saw the opening tag of a new feature structure
                        state = INSIDE_FS
                    elif state == INSIDE_FS:
                        # We saw the opening tag of an array element
                        state = INSIDE_ARRAY
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'start'".format(
                                state))
                elif event == "end":
                    if state == INSIDE_FS:
                        # We saw the closing tag of a new feature
                        state = OUTSIDE_FS

                        # If a type was not found, ignore it if lenient, else raise an exception
                        try:
                            fs = self._parse_feature_structure(
                                typesystem, elem, children)
                            feature_structures[fs.xmiID] = fs
                        except TypeNotFoundError as e:
                            if not lenient:
                                raise e

                            warnings.warn(e.message)
                            xmiID = elem.attrib.get(
                                "{http://www.omg.org/XMI}id", None)
                            if xmiID:
                                lenient_ids.add(int(xmiID))

                        children.clear()
                    elif state == INSIDE_ARRAY:
                        # We saw the closing tag of an array element
                        children[elem.tag].append(elem.text)
                        state = INSIDE_FS
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'end'".format(
                                state))
                else:
                    raise RuntimeError(
                        "Invalid XML event: [{0}]".format(event))

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        # Post-process feature values
        referenced_fs = set()
        for xmi_id, fs in feature_structures.items():
            t = typesystem.get_type(fs.type)

            for feature in t.all_features:
                feature_name = feature.name

                if feature_name == "sofa":
                    value = getattr(fs, feature_name)
                    sofa = sofas[value]
                    setattr(fs, feature_name, sofa)
                    continue

                if (typesystem.is_primitive(feature.rangeTypeName)
                        or typesystem.is_primitive_collection(
                            feature.rangeTypeName)
                        or typesystem.is_primitive_collection(fs.type)):
                    # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                    continue

                # Resolve references here
                value = getattr(fs, feature_name)
                if value is None:
                    continue

                # Resolve references
                if typesystem.is_collection(fs.type, feature):
                    # A collection of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = []
                    for ref in value.split():
                        target_id = int(ref)
                        target = feature_structures[target_id]
                        targets.append(target)
                        referenced_fs.add(target_id)
                    setattr(fs, feature_name, targets)
                else:
                    target_id = int(value)
                    target = feature_structures[target_id]
                    referenced_fs.add(target_id)
                    setattr(fs, feature_name, target)

        cas = Cas(typesystem=typesystem, lenient=lenient)
        for sofa in sofas.values():
            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")

                # We need to make sure that the sofa gets the real xmi, see #155
                view.get_sofa().xmiID = sofa.xmiID
            else:
                view = cas.create_view(sofa.sofaID,
                                       xmiID=sofa.xmiID,
                                       sofaNum=sofa.sofaNum)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            # If a sofa has no members, then UIMA might omit the view. In that case,
            # we create an empty view for it.
            if sofa.xmiID in views:
                proto_view = views[sofa.xmiID]
            else:
                proto_view = ProtoView(sofa.xmiID)

            for member_id in proto_view.members:
                # We ignore ids of feature structures for which we do not have a type
                if member_id in lenient_ids:
                    continue

                fs = feature_structures[member_id]

                # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
                if typesystem.is_instance_of(fs.type, "uima.tcas.Annotation"):
                    try:
                        fs.begin = sofa._offset_converter.uima_to_cassis(
                            fs.begin)
                        fs.end = sofa._offset_converter.uima_to_cassis(fs.end)
                    except KeyError:
                        pass
                view.add_annotation(fs, keep_id=True)

        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        return cas
Exemple #10
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_NULL = NS_CAS + "NULL"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        sofas = []
        views = {}
        annotations = {}

        depth = 0
        context = etree.iterparse(source, events=("end", "start"))
        for event, elem in context:
            #assert event == "end"

            if event == "start":
                depth += 1

                if elem.tag == TAG_XMI:
                    # Ignore the closing 'xmi:XMI' tag
                    pass
                elif elem.tag == TAG_CAS_NULL:
                    pass
                elif elem.tag == TAG_CAS_SOFA:
                    sofa = self._parse_sofa(elem)
                    sofas.append(sofa)
                elif elem.tag == TAG_CAS_VIEW:
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
                else:
                    if depth == 2:
                        annotation = self._parse_annotation(typesystem, elem)
                        annotations[annotation.xmiID] = annotation

                    else:
                        pass

            elif event == "end":
                depth -= 1
                # Free already processed elements from memory
                self._clear_elem(elem)

            else:
                raise RuntimeError("Invalid parsing event '%s'!" % event)

        if len(sofas) != len(views):
            raise RuntimeError("Number of views and sofas is not equal!")

        for annotation in annotations.values():
            ann_type = typesystem.get_type(annotation.type)

            all_features = {}
            all_features.update(ann_type._inherited_features)
            all_features.update(ann_type._features)

            for attribute in annotation.__slots__:
                feature = all_features.get(attribute)

                if feature:
                    feat_type = typesystem.get_type(feature.rangeTypeName)

                    if feat_type.name == 'uima.tcas.Annotation' or feat_type.supertypeName == 'uima.tcas.Annotation':
                        feat_xml_id = annotation.__getattribute__(attribute)

                        if feat_xml_id:
                            feat_ann = annotations.get(int(feat_xml_id))

                            if feat_ann:

                                feat_ancestors = [feat_type.name]
                                while feat_ancestors:
                                    feat_ancestor = feat_ancestors.pop(0)

                                    if feat_ann.type == feat_ancestor:
                                        annotation.__setattr__(
                                            attribute, feat_ann)
                                        feat_ancestors.clear()

                                    else:
                                        feat_ancestors.extend(
                                            typesystem.get_type(
                                                feat_ancestor)._children)

        cas = Cas()
        for sofa in sofas:
            proto_view = views[sofa.xmiID]

            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")
            else:
                view = cas.create_view(sofa.sofaID)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            for member_id in proto_view.members:
                annotation = annotations[member_id]

                view.add_annotation(annotation)

        return cas
Exemple #11
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_NULL = NS_CAS + "NULL"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = []
        views = {}
        feature_structures = {}
        children = defaultdict(list)

        context = etree.iterparse(source, events=("start", "end"))

        state = OUTSIDE_FS

        for event, elem in context:
            if elem.tag == TAG_XMI or elem.tag == TAG_CAS_NULL:
                pass
                # Ignore the 'xmi:XMI' and 'cas:NULL' elements
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(elem)
                    sofas.append(sofa)
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            else:
                """
                In XMI, array element features can be encoded as
                
                <cas:StringArray>
                    <elements>LNC</elements>
                    <elements>MTH</elements>
                    <elements>SNOMEDCT_US</elements>
                </cas:StringArray>
                
                In order to parse this with an incremental XML parser, we need to employ 
                a simple state machine. It is depicted in the following.
                            
                                   "start"               "start"
                     +-----------+-------->+-----------+-------->+--------+
                     | Outside   |         | Inside    |         | Inside |
                +--->+ feature   |         | feature   |         | array  |
                     | structure |         | structure |         | element|
                     +-----------+<--------+-----------+<--------+--------+
                                    "end"                 "end"                                
                """
                if event == "start":
                    if state == OUTSIDE_FS:
                        # We saw the opening tag of a new feature structure
                        state = INSIDE_FS
                    elif state == INSIDE_FS:
                        # We saw the opening tag of an array element
                        state = INSIDE_ARRAY
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'start'".format(
                                state))
                elif event == "end":
                    if state == INSIDE_FS:
                        # We saw the closing tag of a new feature
                        state = OUTSIDE_FS
                        fs = self._parse_feature_structure(
                            typesystem, elem, children)
                        feature_structures[fs.xmiID] = fs

                        children.clear()
                    elif state == INSIDE_ARRAY:
                        # We saw the closing tag of an array element
                        children[elem.tag].append(elem.text)
                        state = INSIDE_FS
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'end'".format(
                                state))
                else:
                    raise RuntimeError(
                        "Invalid XML event: [{0}]".format(event))

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        if len(sofas) != len(views):
            raise RuntimeError("Number of views and sofas is not equal!")

        # Post-process feature values
        for xmi_id, fs in feature_structures.items():
            t = typesystem.get_type(fs.type)

            for feature in t.all_features:
                feature_name = feature.name

                if feature_name == "sofa":
                    continue

                if typesystem.is_primitive(
                        feature.rangeTypeName
                ) or typesystem.is_primitive_collection(feature.rangeTypeName):
                    # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                    continue

                # Resolve references here
                value = getattr(fs, feature_name)
                if value is None:
                    continue

                # Resolve references
                if typesystem.is_collection(feature.rangeTypeName):
                    # A collection of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = []
                    for ref in value.split():
                        target_id = int(ref)
                        target = feature_structures[target_id]
                        targets.append(target)
                    setattr(fs, feature_name, targets)
                else:
                    target_id = int(value)
                    target = feature_structures[target_id]
                    setattr(fs, feature_name, target)

        cas = Cas(typesystem)
        for sofa in sofas:
            proto_view = views[sofa.xmiID]

            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")
            else:
                view = cas.create_view(sofa.sofaID)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            for member_id in proto_view.members:
                annotation = feature_structures[member_id]

                view.add_annotation(annotation)

        return cas
Exemple #12
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: bool, trusted: bool):
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = {}
        views = {}
        feature_structures = {}
        children = defaultdict(list)
        lenient_ids = set()

        context = etree.iterparse(source, events=("start", "end"), huge_tree=trusted)

        state = OUTSIDE_FS
        self._max_xmi_id = 0
        self._max_sofa_num = 0

        for event, elem in context:
            # Ignore the 'xmi:XMI'
            if elem.tag == TAG_XMI:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(typesystem, elem)
                    sofas[sofa.xmiID] = sofa
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            else:
                """
                In XMI, array element features can be encoded as

                <cas:StringArray>
                    <elements>LNC</elements>
                    <elements>MTH</elements>
                    <elements>SNOMEDCT_US</elements>
                </cas:StringArray>

                In order to parse this with an incremental XML parser, we need to employ
                a simple state machine. It is depicted in the following.

                                   "start"               "start"
                     +-----------+-------->+-----------+-------->+--------+
                     | Outside   |         | Inside    |         | Inside |
                +--->+ feature   |         | feature   |         | array  |
                     | structure |         | structure |         | element|
                     +-----------+<--------+-----------+<--------+--------+
                                    "end"                 "end"
                """
                if event == "start":
                    if state == OUTSIDE_FS:
                        # We saw the opening tag of a new feature structure
                        state = INSIDE_FS
                    elif state == INSIDE_FS:
                        # We saw the opening tag of an array element
                        state = INSIDE_ARRAY
                    else:
                        raise RuntimeError(f"Invalid state transition: [{state}] 'start'")
                elif event == "end":
                    if state == INSIDE_FS:
                        # We saw the closing tag of a new feature
                        state = OUTSIDE_FS

                        # If a type was not found, ignore it if lenient, else raise an exception
                        try:
                            fs = self._parse_feature_structure(typesystem, elem, children)
                            feature_structures[fs.xmiID] = fs
                        except TypeNotFoundError as e:
                            if not lenient:
                                raise e

                            warnings.warn(e.message)
                            xmiID = elem.attrib.get("{http://www.omg.org/XMI}id", None)
                            if xmiID:
                                lenient_ids.add(int(xmiID))

                        children.clear()
                    elif state == INSIDE_ARRAY:
                        # We saw the closing tag of an array element
                        children[elem.tag].append(elem.text)
                        state = INSIDE_FS
                    else:
                        raise RuntimeError(f"Invalid state transition: [{state}] 'end'")
                else:
                    raise RuntimeError(f"Invalid XML event: [{event}]")

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        # Post-process feature values
        for xmi_id, fs in feature_structures.items():
            t = typesystem.get_type(fs.type.name)

            for feature in t.all_features:
                feature_name = feature.name
                value = fs[feature_name]

                if feature_name == "sofa":
                    fs[feature_name] = sofas[value]
                    continue

                if typesystem.is_instance_of(fs.type.name, TYPE_NAME_STRING_ARRAY):
                    # We already parsed string arrays to a Python list of string
                    # before, so we do not need to work more on this
                    continue
                elif typesystem.is_primitive(feature.rangeType):
                    fs[feature_name] = self._parse_primitive_value(feature.rangeType, value)
                    continue
                elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
                    # Separately rendered arrays (typically used with multipleReferencesAllowed = True)
                    fs[feature_name] = self._parse_primitive_array(fs.type, value)
                elif typesystem.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
                    # Array feature rendered inline (multipleReferencesAllowed = False|None)
                    # We also end up here for array features that were rendered as child elements. No need to parse
                    # them again, so we check if the value is still a string (i.e. attribute value) and only then
                    # process it
                    if isinstance(value, str):
                        FSType = feature.rangeType
                        fs[feature_name] = FSType(elements=self._parse_primitive_array(feature.rangeType, value))
                elif typesystem.is_primitive_list(feature.rangeType) and not feature.multipleReferencesAllowed:
                    # Array feature rendered inline (multipleReferencesAllowed = False|None)
                    # We also end up here for array features that were rendered as child elements. No need to parse
                    # them again, so we check if the value is still a string (i.e. attribute value) and only then
                    # process it
                    if isinstance(value, str):
                        fs[feature_name] = self._parse_primitive_list(feature.rangeType, value)
                else:
                    # Resolve references here
                    if value is None:
                        continue

                    # Resolve references
                    if fs.type.name == TYPE_NAME_FS_ARRAY or (
                        feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed
                    ):
                        # An array of references is a list of integers separated
                        # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                        targets = []
                        for ref in value.split():
                            target_id = int(ref)
                            target = feature_structures[target_id]
                            targets.append(target)

                        if feature.rangeType.name == TYPE_NAME_FS_ARRAY:
                            # Wrap inline array into the appropriate array object
                            ArrayType = typesystem.get_type(TYPE_NAME_FS_ARRAY)
                            targets = ArrayType(elements=targets)

                        fs[feature_name] = targets
                    elif feature.rangeType.name == TYPE_NAME_FS_LIST and not feature.multipleReferencesAllowed:
                        # Array feature rendered inline (multipleReferencesAllowed = False|None)
                        # We also end up here for array features that were rendered as child elements. No need to parse
                        # them again, so we check if the value is still a string (i.e. attribute value) and only then
                        # process it
                        if isinstance(value, list) or isinstance(value, str):
                            fs[feature_name] = self._parse_fs_list(feature_structures, feature.rangeType, value)
                    else:
                        target_id = int(value)
                        fs[feature_name] = feature_structures[target_id]

        cas = Cas(typesystem=typesystem, lenient=lenient)
        for sofa in sofas.values():
            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")

                # We need to make sure that the sofa gets the real xmi, see #155
                view.get_sofa().xmiID = sofa.xmiID
            else:
                view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            # If a sofa has no members, then UIMA might omit the view. In that case,
            # we create an empty view for it.
            if sofa.xmiID in views:
                proto_view = views[sofa.xmiID]
            else:
                proto_view = ProtoView(sofa.xmiID)

            for member_id in proto_view.members:
                # We ignore ids of feature structures for which we do not have a type
                if member_id in lenient_ids:
                    continue

                fs = feature_structures[member_id]

                # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
                if typesystem.is_instance_of(fs.type.name, "uima.tcas.Annotation"):
                    fs.begin = sofa._offset_converter.external_to_python(fs.begin)
                    fs.end = sofa._offset_converter.external_to_python(fs.end)

                view.add(fs, keep_id=True)

        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        return cas
Exemple #13
0
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = {}
        views = {}
        feature_structures = {}
        children = defaultdict(list)

        context = etree.iterparse(source, events=("start", "end"))

        state = OUTSIDE_FS
        self._max_xmi_id = 0
        self._max_sofa_num = 0

        for event, elem in context:
            # Ignore the 'xmi:XMI'
            if elem.tag == TAG_XMI:
                pass
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(elem)
                    sofas[sofa.xmiID] = sofa
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            else:
                """
                In XMI, array element features can be encoded as
                
                <cas:StringArray>
                    <elements>LNC</elements>
                    <elements>MTH</elements>
                    <elements>SNOMEDCT_US</elements>
                </cas:StringArray>
                
                In order to parse this with an incremental XML parser, we need to employ 
                a simple state machine. It is depicted in the following.
                            
                                   "start"               "start"
                     +-----------+-------->+-----------+-------->+--------+
                     | Outside   |         | Inside    |         | Inside |
                +--->+ feature   |         | feature   |         | array  |
                     | structure |         | structure |         | element|
                     +-----------+<--------+-----------+<--------+--------+
                                    "end"                 "end"                                
                """
                if event == "start":
                    if state == OUTSIDE_FS:
                        # We saw the opening tag of a new feature structure
                        state = INSIDE_FS
                    elif state == INSIDE_FS:
                        # We saw the opening tag of an array element
                        state = INSIDE_ARRAY
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'start'".format(
                                state))
                elif event == "end":
                    if state == INSIDE_FS:
                        # We saw the closing tag of a new feature
                        state = OUTSIDE_FS
                        fs = self._parse_feature_structure(
                            typesystem, elem, children)
                        feature_structures[fs.xmiID] = fs

                        children.clear()
                    elif state == INSIDE_ARRAY:
                        # We saw the closing tag of an array element
                        children[elem.tag].append(elem.text)
                        state = INSIDE_FS
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'end'".format(
                                state))
                else:
                    raise RuntimeError(
                        "Invalid XML event: [{0}]".format(event))

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        # Post-process feature values
        referenced_fs = set()
        for xmi_id, fs in feature_structures.items():
            t = typesystem.get_type(fs.type)

            for feature in t.all_features:
                feature_name = feature.name

                if feature_name == "sofa":
                    value = getattr(fs, feature_name)
                    sofa = sofas[value]
                    setattr(fs, feature_name, sofa)
                    continue

                if (typesystem.is_primitive(feature.rangeTypeName)
                        or typesystem.is_primitive_collection(
                            feature.rangeTypeName)
                        or typesystem.is_primitive_collection(fs.type)):
                    # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                    continue

                # Resolve references here
                value = getattr(fs, feature_name)
                if value is None:
                    continue

                # Resolve references
                if typesystem.is_collection(fs.type, feature):
                    # A collection of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = []
                    for ref in value.split():
                        target_id = int(ref)
                        target = feature_structures[target_id]
                        targets.append(target)
                        referenced_fs.add(target_id)
                    setattr(fs, feature_name, targets)
                else:
                    target_id = int(value)
                    target = feature_structures[target_id]
                    referenced_fs.add(target_id)
                    setattr(fs, feature_name, target)

        cas = Cas(typesystem=typesystem)
        for sofa in sofas.values():
            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")
            else:
                view = cas.create_view(sofa.sofaID,
                                       xmiID=sofa.xmiID,
                                       sofaNum=sofa.sofaNum)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            # If a sofa has no members, then UIMA might omit the view. In that case,
            # we create an empty view for it.
            if sofa.xmiID in views:
                proto_view = views[sofa.xmiID]
            else:
                proto_view = ProtoView(sofa.xmiID)

            # Patch: Rewrite offsets
            if view.sofa_string is not None:
                mapper = JavaOffsetsMapper(view.sofa_string)
            else:
                mapper = None
            # End patch

            for member_id in proto_view.members:
                annotation = feature_structures[member_id]

                # Patch: Rewrite offsets
                if mapper and \
                        hasattr(annotation, "begin") and\
                        hasattr(annotation, "end") and\
                        annotation.begin is not None and\
                        annotation.end is not None:

                    annotation.begin = mapper.java_to_python_begin(
                        annotation.begin)
                    try:
                        annotation.end = mapper.java_to_python_end(
                            annotation.end)
                    except Exception as e:
                        print("\nYYYYYYYYYY", annotation.end)
                        print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXxx",
                              mapper.text)

                # End patch

                view.add_annotation(annotation, keep_id=True)

        cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
        cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

        return cas