def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[str, List[str]]):
    """Build a feature structure instance from one XMI element.

    The type name is derived from ``elem.tag`` by dropping its first nine
    characters (the http prefix), turning ``/`` into ``.`` and removing the
    ``ecore}`` marker. The element attributes and the collected ``children``
    values are merged and passed as keyword arguments to the type obtained
    from ``typesystem.get_type``.

    Side effect: ``self._max_xmi_id`` is raised to the parsed xmi id if that
    id is larger.
    """
    # TODO: Error checking
    qualified_name = elem.tag[9:].replace("/", ".").replace("ecore}", "").strip()
    fs_type = typesystem.get_type(qualified_name)

    kwargs = dict(elem.attrib)
    kwargs.update(children)

    # The xmi:id attribute is exposed under the name xmiID
    kwargs["xmiID"] = int(kwargs.pop("{http://www.omg.org/XMI}id"))

    # These attributes arrive as strings and are stored as integers
    for numeric_key in ("begin", "end", "sofa"):
        if numeric_key in kwargs:
            kwargs[numeric_key] = int(kwargs[numeric_key])

    # Feature names that collide with reserved Python names get a trailing underscore
    for reserved in ("self", "type"):
        if reserved in kwargs:
            kwargs[reserved + "_"] = kwargs.pop(reserved)

    self._max_xmi_id = max(kwargs["xmiID"], self._max_xmi_id)
    return fs_type(**kwargs)
def _parse_annotation(self, typesystem: TypeSystem, elem):
    """Build an annotation instance from one XMI element.

    For a namespaced tag (one starting with ``{``) the type name is rebuilt by
    joining with ``.`` every path segment that follows a ``/`` or ``}`` in the
    tag. Any other tag is used as the type name unchanged. The element
    attributes become the keyword arguments of the type returned by
    ``typesystem.get_type``.
    """
    # TODO: Error checking
    type_name = elem.tag
    if type_name.startswith("{"):
        type_name = ".".join(re.findall("(?:/|})([^}/.]+)", type_name))

    annotation_cls = typesystem.get_type(type_name)
    kwargs = dict(elem.attrib)

    # The xmi:id attribute is exposed under the name xmiID
    kwargs["xmiID"] = int(kwargs.pop("{http://www.omg.org/XMI}id"))

    # Integer-valued attributes are only converted when they are present
    for numeric_key in ("begin", "end", "sofa"):
        if numeric_key in kwargs:
            kwargs[numeric_key] = int(kwargs[numeric_key])

    return annotation_cls(**kwargs)
def _parse_sofa(self, typesystem: TypeSystem, elem) -> Sofa:
    """Create a ``Sofa`` from the attributes of an XMI sofa element.

    The xmi id and ``sofaNum`` are converted to integers and the sofa type is
    looked up in ``typesystem`` under ``TYPE_NAME_SOFA``.

    Side effect: ``self._max_xmi_id`` and ``self._max_sofa_num`` are raised to
    the parsed values if those are larger.
    """
    fields = dict(elem.attrib)

    xmi_id = int(fields.pop("{http://www.omg.org/XMI}id"))
    fields["xmiID"] = xmi_id

    sofa_num = int(fields["sofaNum"])
    fields["sofaNum"] = sofa_num

    fields["type"] = typesystem.get_type(TYPE_NAME_SOFA)

    # Remember the highest ids seen so far
    self._max_xmi_id = max(xmi_id, self._max_xmi_id)
    self._max_sofa_num = max(sofa_num, self._max_sofa_num)

    return Sofa(**fields)
def _parse_annotation(self, typesystem: TypeSystem, elem):
    """Build an annotation instance from one XMI element.

    The type name is derived from ``elem.tag`` by dropping its first nine
    characters (the http prefix), turning ``/`` into ``.`` and removing the
    ``ecore}`` marker. The element attributes become the keyword arguments of
    the type returned by ``typesystem.get_type``.

    ``begin``, ``end`` and ``sofa`` are converted to integers only when the
    element actually carries them, so elements without these attributes no
    longer fail with a ``KeyError``.

    Raises:
        KeyError: if the element has no xmi:id attribute.
        ValueError: if an id or offset attribute is not an integer literal.
    """
    # TODO: Error checking
    typename = elem.tag[9:].replace('/', '.').replace('ecore}', '')

    AnnotationType = typesystem.get_type(typename)
    attributes = dict(elem.attrib)

    # Map the xmi:id attribute to xmiID
    attributes['xmiID'] = int(attributes.pop('{http://www.omg.org/XMI}id'))

    # Not every element has offsets or a sofa reference; convert what is there
    for key in ('begin', 'end', 'sofa'):
        if key in attributes:
            attributes[key] = int(attributes[key])

    return AnnotationType(**attributes)
def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[str, List[str]]):
    """Build a feature structure instance from one XMI element.

    The type name is derived from ``elem.tag`` by dropping its first nine
    characters (the http prefix), turning ``/`` into ``.`` and removing the
    ``ecore}`` marker. A leading ``uima.noNamespace.`` is stripped as well.
    Element attributes and the collected ``children`` values are merged into
    the keyword arguments for the type returned by ``typesystem.get_type``.

    Side effect: ``self._max_xmi_id`` is raised to the parsed xmi id if that
    id is larger.
    """
    # TODO: Error checking
    no_namespace_prefix = "uima.noNamespace."
    type_name: str = elem.tag[9:].replace("/", ".").replace("ecore}", "").strip()
    if type_name.startswith(no_namespace_prefix):
        type_name = type_name[len(no_namespace_prefix):]

    fs_type = typesystem.get_type(type_name)

    kwargs = dict(elem.attrib)
    kwargs.update(children)

    # The xmi:id attribute is exposed under the name xmiID
    kwargs["xmiID"] = int(kwargs.pop("{http://www.omg.org/XMI}id"))

    # These attributes arrive as strings and are stored as integers
    for numeric_key in ("begin", "end", "sofa"):
        if numeric_key in kwargs:
            kwargs[numeric_key] = int(kwargs[numeric_key])

    # Feature names that collide with reserved Python names get a trailing underscore
    for reserved in ("self", "type"):
        if reserved in kwargs:
            kwargs[reserved + "_"] = kwargs.pop(reserved)

    # Arrays that were represented as nested elements in the XMI have so far only
    # been collected into plain Python lists. Convert them to proper UIMA
    # arrays/lists, unless the feature structure itself is a primitive array.
    if not typesystem.is_primitive_array(type_name):
        for feature_name in children:
            range_type = fs_type.get_feature(feature_name).rangeType

            if typesystem.is_primitive_array(range_type):
                kwargs[feature_name] = range_type(elements=kwargs[feature_name])

            if typesystem.is_primitive_list(range_type):
                kwargs[feature_name] = self._parse_primitive_list(range_type, kwargs[feature_name])

    self._max_xmi_id = max(kwargs["xmiID"], self._max_xmi_id)
    return fs_type(**kwargs)
def _parse_annotation(self, typesystem: TypeSystem, elem):
    """Build an annotation instance from one XMI element.

    The type name is derived from ``elem.tag`` by dropping its first nine
    characters (the http prefix), turning ``/`` into ``.`` and removing the
    ``ecore}`` marker. The element attributes become the keyword arguments of
    the type returned by ``typesystem.get_type``.
    """
    # TODO: Error checking
    without_prefix = elem.tag[9:]
    type_name = without_prefix.replace("/", ".").replace("ecore}", "")

    annotation_cls = typesystem.get_type(type_name)
    kwargs = dict(elem.attrib)

    # The xmi:id attribute is exposed under the name xmiID
    xmi_id = kwargs.pop("{http://www.omg.org/XMI}id")
    kwargs["xmiID"] = int(xmi_id)

    # Integer-valued attributes are only converted when they are present
    for numeric_key in ("begin", "end", "sofa"):
        if numeric_key in kwargs:
            kwargs[numeric_key] = int(kwargs[numeric_key])

    return annotation_cls(**kwargs)
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: bool, trusted: bool):
    """Read an XMI document and build a ``Cas`` from it.

    Args:
        source: Handed to ``etree.iterparse`` as the document to read.
        typesystem: Used to look up the type of every parsed element and to
            classify features during reference resolution.
        lenient: When true, an element whose type raises ``TypeNotFoundError``
            is skipped with a warning instead of aborting; its xmi id is
            remembered so that view members pointing at it are ignored.
        trusted: Passed to ``etree.iterparse`` as ``huge_tree``.

    Returns:
        The populated ``Cas``. Its xmi id and sofa number generators start
        one past the largest values seen in the document.

    Raises:
        TypeNotFoundError: for an unknown type when ``lenient`` is false.
        RuntimeError: when the element nesting does not fit the state machine.
    """
    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    sofas = {}
    views = {}
    feature_structures = {}
    children = defaultdict(list)
    lenient_ids = set()

    context = etree.iterparse(source, events=("start", "end"), huge_tree=trusted)

    state = OUTSIDE_FS
    self._max_xmi_id = 0
    self._max_sofa_num = 0

    for event, elem in context:
        # Ignore the 'xmi:XMI'
        if elem.tag == TAG_XMI:
            pass
        elif elem.tag == TAG_CAS_SOFA:
            if event == "end":
                sofa = self._parse_sofa(elem)
                sofas[sofa.xmiID] = sofa
        elif elem.tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        else:
            # In XMI, array element features can be encoded as
            #
            #     <cas:StringArray>
            #         <elements>LNC</elements>
            #         <elements>MTH</elements>
            #         <elements>SNOMEDCT_US</elements>
            #     </cas:StringArray>
            #
            # In order to parse this with an incremental XML parser, we need
            # to employ a simple state machine:
            #
            #            "start"               "start"
            #  +-----------+-------->+-----------+-------->+---------+
            #  |  Outside  |         |  Inside   |         | Inside  |
            #  |  feature  |         |  feature  |         | array   |
            #  | structure |         | structure |         | element |
            #  +-----------+<--------+-----------+<--------+---------+
            #            "end"                 "end"
            if event == "start":
                if state == OUTSIDE_FS:
                    # We saw the opening tag of a new feature structure
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # We saw the opening tag of an array element
                    state = INSIDE_ARRAY
                else:
                    raise RuntimeError("Invalid state transition: [{0}] 'start'".format(state))
            elif event == "end":
                if state == INSIDE_FS:
                    # We saw the closing tag of a feature structure
                    state = OUTSIDE_FS

                    # If a type was not found, ignore it if lenient, else raise an exception
                    try:
                        fs = self._parse_feature_structure(typesystem, elem, children)
                        feature_structures[fs.xmiID] = fs
                    except TypeNotFoundError as e:
                        if not lenient:
                            # Bare raise keeps the original traceback untouched
                            raise

                        warnings.warn(e.message)

                        xmiID = elem.attrib.get("{http://www.omg.org/XMI}id")
                        if xmiID:
                            lenient_ids.add(int(xmiID))

                    children.clear()
                elif state == INSIDE_ARRAY:
                    # We saw the closing tag of an array element
                    children[elem.tag].append(elem.text)
                    state = INSIDE_FS
                else:
                    raise RuntimeError("Invalid state transition: [{0}] 'end'".format(state))
            else:
                raise RuntimeError("Invalid XML event: [{0}]".format(event))

        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    # Post-process feature values
    for fs in feature_structures.values():
        t = typesystem.get_type(fs.type)

        for feature in t.all_features:
            feature_name = feature.name

            if feature_name == "sofa":
                # Replace the sofa id by the sofa object itself
                setattr(fs, feature_name, sofas[getattr(fs, feature_name)])
                continue

            if (
                typesystem.is_primitive(feature.rangeTypeName)
                or typesystem.is_primitive_collection(feature.rangeTypeName)
                or typesystem.is_primitive_collection(fs.type)
            ):
                # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                continue

            # Resolve references here
            value = getattr(fs, feature_name)
            if value is None:
                continue

            if typesystem.is_collection(fs.type, feature):
                # A collection of references is a list of integers separated
                # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                targets = [feature_structures[int(ref)] for ref in value.split()]
                setattr(fs, feature_name, targets)
            else:
                setattr(fs, feature_name, feature_structures[int(value)])

    cas = Cas(typesystem=typesystem, lenient=lenient)
    for sofa in sofas.values():
        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")

            # We need to make sure that the sofa gets the real xmi, see #155
            view.get_sofa().xmiID = sofa.xmiID
        else:
            view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        # If a sofa has no members, then UIMA might omit the view. In that case,
        # we create an empty view for it.
        if sofa.xmiID in views:
            proto_view = views[sofa.xmiID]
        else:
            proto_view = ProtoView(sofa.xmiID)

        for member_id in proto_view.members:
            # We ignore ids of feature structures for which we do not have a type
            if member_id in lenient_ids:
                continue

            fs = feature_structures[member_id]

            # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
            if typesystem.is_instance_of(fs.type, "uima.tcas.Annotation"):
                try:
                    fs.begin = sofa._offset_converter.uima_to_cassis(fs.begin)
                    fs.end = sofa._offset_converter.uima_to_cassis(fs.end)
                except KeyError:
                    # Best effort: an offset the converter does not know is kept as it is
                    pass

            view.add_annotation(fs, keep_id=True)

    cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
    cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

    return cas
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
    """Read an XMI document and build a ``Cas`` from it.

    Elements are handled on their "start" event. Sofas and views are parsed
    wherever they occur; every other element is parsed as an annotation only
    when it sits at nesting depth 2. Afterwards, features whose range type is
    ``uima.tcas.Annotation`` (or a direct subtype of it) are resolved from
    xmi ids to the referenced annotation objects.

    Raises:
        RuntimeError: on an unexpected parser event, or when the number of
            sofas differs from the number of views.
    """
    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_NULL = NS_CAS + "NULL"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    ANNOTATION_TYPE_NAME = 'uima.tcas.Annotation'

    sofas = []
    views = {}
    annotations = {}

    depth = 0
    context = etree.iterparse(source, events=("end", "start"))

    for event, elem in context:
        if event == "start":
            depth += 1
            tag = elem.tag

            if tag == TAG_XMI or tag == TAG_CAS_NULL:
                # Neither the 'xmi:XMI' root nor 'cas:NULL' carries data
                pass
            elif tag == TAG_CAS_SOFA:
                sofas.append(self._parse_sofa(elem))
            elif tag == TAG_CAS_VIEW:
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
            elif depth == 2:
                # Only direct children of the root are annotations
                annotation = self._parse_annotation(typesystem, elem)
                annotations[annotation.xmiID] = annotation
        elif event == "end":
            depth -= 1

            # Free already processed elements from memory
            self._clear_elem(elem)
        else:
            raise RuntimeError("Invalid parsing event '%s'!" % event)

    if len(sofas) != len(views):
        raise RuntimeError("Number of views and sofas is not equal!")

    # Replace xmi ids stored in annotation-valued features by the annotations
    for annotation in annotations.values():
        ann_type = typesystem.get_type(annotation.type)

        # Own features take precedence over inherited ones of the same name
        all_features = dict(ann_type._inherited_features)
        all_features.update(ann_type._features)

        for attribute in annotation.__slots__:
            feature = all_features.get(attribute)
            if not feature:
                continue

            feat_type = typesystem.get_type(feature.rangeTypeName)
            if feat_type.name != ANNOTATION_TYPE_NAME and feat_type.supertypeName != ANNOTATION_TYPE_NAME:
                continue

            feat_xml_id = getattr(annotation, attribute)
            if not feat_xml_id:
                continue

            feat_ann = annotations.get(int(feat_xml_id))
            if not feat_ann:
                continue

            # Breadth-first walk from the range type down through its children;
            # the reference is only set when the target's type is found there.
            pending = [feat_type.name]
            while pending:
                candidate = pending.pop(0)
                if feat_ann.type == candidate:
                    setattr(annotation, attribute, feat_ann)
                    break
                pending.extend(typesystem.get_type(candidate)._children)

    cas = Cas()
    for sofa in sofas:
        proto_view = views[sofa.xmiID]

        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")
        else:
            view = cas.create_view(sofa.sofaID)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        for member_id in proto_view.members:
            view.add_annotation(annotations[member_id])

    return cas
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
    """Read an XMI document and build a ``Cas`` from it.

    The document is parsed incrementally. Sofas and views are parsed on their
    closing tag; every other element is run through a small state machine
    that collects nested array elements before the enclosing feature
    structure is built. Afterwards, non-primitive feature values are resolved
    from xmi ids to the referenced feature structures.

    Raises:
        RuntimeError: when the element nesting does not fit the state machine,
            or when the number of sofas differs from the number of views.
    """
    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_NULL = NS_CAS + "NULL"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    # In XMI, array element features can be encoded as
    #
    #     <cas:StringArray>
    #         <elements>LNC</elements>
    #         <elements>MTH</elements>
    #         <elements>SNOMEDCT_US</elements>
    #     </cas:StringArray>
    #
    # In order to parse this with an incremental XML parser, we need to
    # employ a simple state machine:
    #
    #            "start"               "start"
    #  +-----------+-------->+-----------+-------->+---------+
    #  |  Outside  |         |  Inside   |         | Inside  |
    #  |  feature  |         |  feature  |         | array   |
    #  | structure |         | structure |         | element |
    #  +-----------+<--------+-----------+<--------+---------+
    #            "end"                 "end"
    on_start = {OUTSIDE_FS: INSIDE_FS, INSIDE_FS: INSIDE_ARRAY}

    sofas = []
    views = {}
    feature_structures = {}
    children = defaultdict(list)

    context = etree.iterparse(source, events=("start", "end"))

    state = OUTSIDE_FS

    for event, elem in context:
        tag = elem.tag

        if tag == TAG_XMI or tag == TAG_CAS_NULL:
            # Ignore the 'xmi:XMI' and 'cas:NULL' elements
            pass
        elif tag == TAG_CAS_SOFA:
            if event == "end":
                sofas.append(self._parse_sofa(elem))
        elif tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        elif event == "start":
            # Opening tag: either a new feature structure or an array element
            if state not in on_start:
                raise RuntimeError("Invalid state transition: [{0}] 'start'".format(state))
            state = on_start[state]
        elif event == "end":
            if state == INSIDE_FS:
                # Closing tag of a feature structure: build it from what was collected
                state = OUTSIDE_FS
                fs = self._parse_feature_structure(typesystem, elem, children)
                feature_structures[fs.xmiID] = fs
                children.clear()
            elif state == INSIDE_ARRAY:
                # Closing tag of an array element: remember its text
                children[tag].append(elem.text)
                state = INSIDE_FS
            else:
                raise RuntimeError("Invalid state transition: [{0}] 'end'".format(state))
        else:
            raise RuntimeError("Invalid XML event: [{0}]".format(event))

        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    if len(sofas) != len(views):
        raise RuntimeError("Number of views and sofas is not equal!")

    # Post-process feature values
    for fs in feature_structures.values():
        fs_type = typesystem.get_type(fs.type)

        for feature in fs_type.all_features:
            name = feature.name
            range_name = feature.rangeTypeName

            # The sofa feature and primitive values are left untouched.
            # TODO: Parse feature values to their real type here, e.g. parse ints or floats
            if (
                name == "sofa"
                or typesystem.is_primitive(range_name)
                or typesystem.is_primitive_collection(range_name)
            ):
                continue

            # Resolve references here
            value = getattr(fs, name)
            if value is None:
                continue

            if typesystem.is_collection(range_name):
                # A collection of references is a list of integers separated
                # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                resolved = [feature_structures[int(ref)] for ref in value.split()]
            else:
                resolved = feature_structures[int(value)]

            setattr(fs, name, resolved)

    cas = Cas(typesystem)
    for sofa in sofas:
        proto_view = views[sofa.xmiID]

        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")
        else:
            view = cas.create_view(sofa.sofaID)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        for member_id in proto_view.members:
            view.add_annotation(feature_structures[member_id])

    return cas
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: bool, trusted: bool):
    """Read an XMI document and build a ``Cas`` from it.

    Args:
        source: Handed to ``etree.iterparse`` as the document to read.
        typesystem: Used to look up the type of every parsed element and to
            classify features during post-processing.
        lenient: When true, an element whose type raises ``TypeNotFoundError``
            is skipped with a warning instead of aborting; its xmi id is
            remembered so that view members pointing at it are ignored.
        trusted: Passed to ``etree.iterparse`` as ``huge_tree``.

    Returns:
        The populated ``Cas``. Its xmi id and sofa number generators start
        one past the largest values seen in the document.
    """
    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    # In XMI, array element features can be encoded as
    #
    #     <cas:StringArray>
    #         <elements>LNC</elements>
    #         <elements>MTH</elements>
    #         <elements>SNOMEDCT_US</elements>
    #     </cas:StringArray>
    #
    # In order to parse this with an incremental XML parser, we need to
    # employ a simple state machine:
    #
    #            "start"               "start"
    #  +-----------+-------->+-----------+-------->+---------+
    #  |  Outside  |         |  Inside   |         | Inside  |
    #  |  feature  |         |  feature  |         | array   |
    #  | structure |         | structure |         | element |
    #  +-----------+<--------+-----------+<--------+---------+
    #            "end"                 "end"
    on_start = {OUTSIDE_FS: INSIDE_FS, INSIDE_FS: INSIDE_ARRAY}

    sofas = {}
    views = {}
    feature_structures = {}
    children = defaultdict(list)
    lenient_ids = set()

    context = etree.iterparse(source, events=("start", "end"), huge_tree=trusted)

    state = OUTSIDE_FS
    self._max_xmi_id = 0
    self._max_sofa_num = 0

    for event, elem in context:
        tag = elem.tag

        if tag == TAG_XMI:
            # Ignore the 'xmi:XMI'
            pass
        elif tag == TAG_CAS_SOFA:
            if event == "end":
                sofa = self._parse_sofa(typesystem, elem)
                sofas[sofa.xmiID] = sofa
        elif tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        elif event == "start":
            # Opening tag: either a new feature structure or an array element
            if state not in on_start:
                raise RuntimeError(f"Invalid state transition: [{state}] 'start'")
            state = on_start[state]
        elif event == "end":
            if state == INSIDE_FS:
                # Closing tag of a feature structure
                state = OUTSIDE_FS

                # If a type was not found, ignore it if lenient, else raise an exception
                try:
                    fs = self._parse_feature_structure(typesystem, elem, children)
                    feature_structures[fs.xmiID] = fs
                except TypeNotFoundError as e:
                    if not lenient:
                        raise e

                    warnings.warn(e.message)

                    skipped_id = elem.attrib.get("{http://www.omg.org/XMI}id", None)
                    if skipped_id:
                        lenient_ids.add(int(skipped_id))

                children.clear()
            elif state == INSIDE_ARRAY:
                # Closing tag of an array element: remember its text
                children[tag].append(elem.text)
                state = INSIDE_FS
            else:
                raise RuntimeError(f"Invalid state transition: [{state}] 'end'")
        else:
            raise RuntimeError(f"Invalid XML event: [{event}]")

        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    # Post-process feature values
    for fs in feature_structures.values():
        fs_type = typesystem.get_type(fs.type.name)

        for feature in fs_type.all_features:
            name = feature.name
            value = fs[name]

            if name == "sofa":
                # Replace the sofa id by the sofa object itself
                fs[name] = sofas[value]
            elif typesystem.is_instance_of(fs.type.name, TYPE_NAME_STRING_ARRAY):
                # We already parsed string arrays to a Python list of strings
                # before, so we do not need to work more on this
                pass
            elif typesystem.is_primitive(feature.rangeType):
                fs[name] = self._parse_primitive_value(feature.rangeType, value)
            elif typesystem.is_primitive_array(fs.type) and name == "elements":
                # Separately rendered arrays (typically used with multipleReferencesAllowed = True)
                fs[name] = self._parse_primitive_array(fs.type, value)
            elif typesystem.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
                # Array feature rendered inline (multipleReferencesAllowed = False|None)
                # We also end up here for array features that were rendered as child elements. No need to
                # parse them again, so only a value that is still a string (i.e. attribute value) is processed
                if isinstance(value, str):
                    array_cls = feature.rangeType
                    fs[name] = array_cls(elements=self._parse_primitive_array(feature.rangeType, value))
            elif typesystem.is_primitive_list(feature.rangeType) and not feature.multipleReferencesAllowed:
                # List feature rendered inline (multipleReferencesAllowed = False|None)
                # As above, only a value that is still a string needs to be parsed
                if isinstance(value, str):
                    fs[name] = self._parse_primitive_list(feature.rangeType, value)
            elif value is None:
                # Nothing to resolve
                pass
            elif fs.type.name == TYPE_NAME_FS_ARRAY or (
                feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed
            ):
                # An array of references is a list of integers separated
                # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                targets = [feature_structures[int(ref)] for ref in value.split()]

                if feature.rangeType.name == TYPE_NAME_FS_ARRAY:
                    # Wrap inline array into the appropriate array object
                    fs_array_cls = typesystem.get_type(TYPE_NAME_FS_ARRAY)
                    targets = fs_array_cls(elements=targets)

                fs[name] = targets
            elif feature.rangeType.name == TYPE_NAME_FS_LIST and not feature.multipleReferencesAllowed:
                # List of references rendered inline or as child elements;
                # parsed only while the value is still a raw list or string
                if isinstance(value, (list, str)):
                    fs[name] = self._parse_fs_list(feature_structures, feature.rangeType, value)
            else:
                # A single reference: the value is the xmi id of the target
                fs[name] = feature_structures[int(value)]

    cas = Cas(typesystem=typesystem, lenient=lenient)
    for sofa in sofas.values():
        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")

            # We need to make sure that the sofa gets the real xmi, see #155
            view.get_sofa().xmiID = sofa.xmiID
        else:
            view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        # If a sofa has no members, then UIMA might omit the view. In that case,
        # we create an empty view for it.
        proto_view = views[sofa.xmiID] if sofa.xmiID in views else ProtoView(sofa.xmiID)

        for member_id in proto_view.members:
            # We ignore ids of feature structures for which we do not have a type
            if member_id in lenient_ids:
                continue

            member = feature_structures[member_id]

            # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
            if typesystem.is_instance_of(member.type.name, "uima.tcas.Annotation"):
                member.begin = sofa._offset_converter.external_to_python(member.begin)
                member.end = sofa._offset_converter.external_to_python(member.end)

            view.add(member, keep_id=True)

    cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
    cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

    return cas
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
    """Read an XMI document and build a ``Cas`` from it.

    The document is parsed incrementally. Sofas and views are parsed on their
    closing tag; every other element is run through a small state machine
    that collects nested array elements before the enclosing feature
    structure is built. Afterwards, non-primitive feature values are resolved
    from xmi ids to the referenced feature structures, and the ``begin`` and
    ``end`` offsets of view members are rewritten with a ``JavaOffsetsMapper``
    built from the sofa string.

    An ``end`` offset that cannot be mapped is left unchanged and reported
    through ``warnings.warn``.

    Returns:
        The populated ``Cas``. Its xmi id and sofa number generators start
        one past the largest values seen in the document.

    Raises:
        RuntimeError: when the element nesting does not fit the state machine.
    """
    import warnings

    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    sofas = {}
    views = {}
    feature_structures = {}
    children = defaultdict(list)

    context = etree.iterparse(source, events=("start", "end"))

    state = OUTSIDE_FS
    self._max_xmi_id = 0
    self._max_sofa_num = 0

    for event, elem in context:
        # Ignore the 'xmi:XMI'
        if elem.tag == TAG_XMI:
            pass
        elif elem.tag == TAG_CAS_SOFA:
            if event == "end":
                sofa = self._parse_sofa(elem)
                sofas[sofa.xmiID] = sofa
        elif elem.tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        else:
            # In XMI, array element features can be encoded as
            #
            #     <cas:StringArray>
            #         <elements>LNC</elements>
            #         <elements>MTH</elements>
            #         <elements>SNOMEDCT_US</elements>
            #     </cas:StringArray>
            #
            # In order to parse this with an incremental XML parser, we need
            # to employ a simple state machine:
            #
            #            "start"               "start"
            #  +-----------+-------->+-----------+-------->+---------+
            #  |  Outside  |         |  Inside   |         | Inside  |
            #  |  feature  |         |  feature  |         | array   |
            #  | structure |         | structure |         | element |
            #  +-----------+<--------+-----------+<--------+---------+
            #            "end"                 "end"
            if event == "start":
                if state == OUTSIDE_FS:
                    # We saw the opening tag of a new feature structure
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # We saw the opening tag of an array element
                    state = INSIDE_ARRAY
                else:
                    raise RuntimeError("Invalid state transition: [{0}] 'start'".format(state))
            elif event == "end":
                if state == INSIDE_FS:
                    # We saw the closing tag of a feature structure
                    state = OUTSIDE_FS
                    fs = self._parse_feature_structure(typesystem, elem, children)
                    feature_structures[fs.xmiID] = fs
                    children.clear()
                elif state == INSIDE_ARRAY:
                    # We saw the closing tag of an array element
                    children[elem.tag].append(elem.text)
                    state = INSIDE_FS
                else:
                    raise RuntimeError("Invalid state transition: [{0}] 'end'".format(state))
            else:
                raise RuntimeError("Invalid XML event: [{0}]".format(event))

        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    # Post-process feature values
    for fs in feature_structures.values():
        t = typesystem.get_type(fs.type)

        for feature in t.all_features:
            feature_name = feature.name

            if feature_name == "sofa":
                # Replace the sofa id by the sofa object itself
                setattr(fs, feature_name, sofas[getattr(fs, feature_name)])
                continue

            if (
                typesystem.is_primitive(feature.rangeTypeName)
                or typesystem.is_primitive_collection(feature.rangeTypeName)
                or typesystem.is_primitive_collection(fs.type)
            ):
                # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                continue

            # Resolve references here
            value = getattr(fs, feature_name)
            if value is None:
                continue

            if typesystem.is_collection(fs.type, feature):
                # A collection of references is a list of integers separated
                # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                targets = [feature_structures[int(ref)] for ref in value.split()]
                setattr(fs, feature_name, targets)
            else:
                setattr(fs, feature_name, feature_structures[int(value)])

    cas = Cas(typesystem=typesystem)
    for sofa in sofas.values():
        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")
        else:
            view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        # If a sofa has no members, then UIMA might omit the view. In that case,
        # we create an empty view for it.
        if sofa.xmiID in views:
            proto_view = views[sofa.xmiID]
        else:
            proto_view = ProtoView(sofa.xmiID)

        # Offsets can only be rewritten when the view actually has text
        if view.sofa_string is not None:
            mapper = JavaOffsetsMapper(view.sofa_string)
        else:
            mapper = None

        for member_id in proto_view.members:
            annotation = feature_structures[member_id]

            has_offsets = (
                hasattr(annotation, "begin")
                and hasattr(annotation, "end")
                and annotation.begin is not None
                and annotation.end is not None
            )
            if mapper and has_offsets:
                annotation.begin = mapper.java_to_python_begin(annotation.begin)
                try:
                    annotation.end = mapper.java_to_python_end(annotation.end)
                except Exception as e:
                    # Best effort: keep the unmapped offset, but report it
                    # instead of dumping debug output to stdout.
                    warnings.warn(
                        "Could not map end offset [{0}] of feature structure [{1}]: {2!r}".format(
                            annotation.end, member_id, e
                        )
                    )

            view.add_annotation(annotation, keep_id=True)

    cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
    cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

    return cas