def _parse_feature_structure(self, typesystem: TypeSystem, elem, children: Dict[str, List[str]]):
    """Create a feature structure instance from a single XMI element.

    The type name is derived from the element tag and looked up in ``typesystem``. The element's XML
    attributes, merged with the values collected from nested child elements, become the keyword arguments
    for instantiating that type.

    Args:
        typesystem: Type system used to look up the type and to classify feature range types.
        elem: The XML element describing the feature structure; its ``tag`` and ``attrib`` are read.
        children: Values of nested child elements, keyed by child name. They override XML attributes
            of the same name.

    Returns:
        An instance of the type returned by ``typesystem.get_type`` for the element's type name.

    Side effects:
        Raises ``self._max_xmi_id`` to the element's xmi id if that id is larger.
    """
    # TODO: Error checking
    # Drop the leading http prefix (the first nine characters of the tag), turn "/" into "." and remove
    # the "ecore}" part to obtain the type name.
    tag_without_prefix = elem.tag[len("{http:///"):]
    type_name: str = tag_without_prefix.replace("/", ".").replace("ecore}", "").strip()

    no_namespace_prefix = "uima.noNamespace."
    if type_name.startswith(no_namespace_prefix):
        type_name = type_name[len(no_namespace_prefix):]

    AnnotationType = typesystem.get_type(type_name)

    # XML attributes first, then the nested child values on top of them
    attributes = dict(elem.attrib)
    attributes.update(children)

    # Map the xmi:id attribute to xmiID
    attributes["xmiID"] = int(attributes.pop("{http://www.omg.org/XMI}id"))

    # These attributes are converted to integers when present
    for int_key in ("begin", "end", "sofa"):
        if int_key in attributes:
            attributes[int_key] = int(attributes[int_key])

    # Remap features that use a reserved Python name by appending an underscore
    for reserved in ("self", "type"):
        if reserved in attributes:
            attributes[reserved + "_"] = attributes.pop(reserved)

    # Arrays which were represented as nested elements in the XMI have so far only been parsed into
    # Python lists. Now we convert them to proper UIMA arrays/lists.
    if not typesystem.is_primitive_array(type_name):
        for feature_name in children:
            range_type = AnnotationType.get_feature(feature_name).rangeType

            if typesystem.is_primitive_array(range_type):
                attributes[feature_name] = range_type(elements=attributes[feature_name])

            if typesystem.is_primitive_list(range_type):
                attributes[feature_name] = self._parse_primitive_list(range_type, attributes[feature_name])

    # Keep track of the largest xmi id seen so far
    self._max_xmi_id = max(attributes["xmiID"], self._max_xmi_id)

    return AnnotationType(**attributes)
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: bool, trusted: bool):
    """Parse an XMI document into a ``Cas``.

    The document is read incrementally with ``etree.iterparse``. Sofas, views and feature structures are
    collected in a first pass. Afterwards the raw feature values are converted and references between
    feature structures are resolved. Finally a ``Cas`` is created and its views are populated.

    Args:
        source: File-like object or string handed to ``etree.iterparse``.
        typesystem: Type system used to look up the types of the parsed feature structures.
        lenient: If true, a feature structure whose type is not found is skipped with a warning instead
            of raising. The flag is also passed on to the ``Cas`` constructor.
        trusted: Passed to ``etree.iterparse`` as ``huge_tree``.

    Returns:
        The ``Cas`` built from the document.

    Raises:
        TypeNotFoundError: If a type is not found and ``lenient`` is false.
        RuntimeError: If the sequence of parser events does not fit the expected element nesting.

    Side effects:
        Resets ``self._max_xmi_id`` and ``self._max_sofa_num`` to 0 before parsing; their final values
        seed the id generators of the returned CAS.
    """
    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    # States of the parser state machine (see the diagram in the parsing loop)
    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    sofas = {}
    views = {}
    feature_structures = {}
    children = defaultdict(list)
    # xmi ids of feature structures that were skipped because their type was not found
    lenient_ids = set()

    context = etree.iterparse(source, events=("start", "end"), huge_tree=trusted)

    state = OUTSIDE_FS
    self._max_xmi_id = 0
    self._max_sofa_num = 0

    for event, elem in context:
        if elem.tag == TAG_XMI:
            # Ignore the 'xmi:XMI'
            pass
        elif elem.tag == TAG_CAS_SOFA:
            if event == "end":
                sofa = self._parse_sofa(typesystem, elem)
                sofas[sofa.xmiID] = sofa
        elif elem.tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        # Everything else is a feature structure or an array element nested in one.
        #
        # In XMI, array element features can be encoded as
        #
        # <cas:StringArray>
        #     <elements>LNC</elements>
        #     <elements>MTH</elements>
        #     <elements>SNOMEDCT_US</elements>
        # </cas:StringArray>
        #
        # In order to parse this with an incremental XML parser, we need to employ
        # a simple state machine. It is depicted in the following.
        #
        #                   "start"               "start"
        #      +-----------+-------->+-----------+-------->+--------+
        #      |  Outside  |         |  Inside   |         | Inside |
        # +--->+  feature  |         |  feature  |         | array  |
        #      | structure |         | structure |         | element|
        #      +-----------+<--------+-----------+<--------+--------+
        #                    "end"                 "end"
        elif event == "start":
            if state == OUTSIDE_FS:
                # We saw the opening tag of a new feature structure
                state = INSIDE_FS
            elif state == INSIDE_FS:
                # We saw the opening tag of an array element
                state = INSIDE_ARRAY
            else:
                raise RuntimeError(f"Invalid state transition: [{state}] 'start'")
        elif event == "end":
            if state == INSIDE_FS:
                # We saw the closing tag of a feature structure
                state = OUTSIDE_FS

                # If a type was not found, ignore it if lenient, else raise an exception
                try:
                    fs = self._parse_feature_structure(typesystem, elem, children)
                    feature_structures[fs.xmiID] = fs
                except TypeNotFoundError as e:
                    if not lenient:
                        raise

                    warnings.warn(e.message)

                    # Remember the id so that view members pointing to it can be skipped later
                    skipped_id = elem.attrib.get("{http://www.omg.org/XMI}id", None)
                    if skipped_id:
                        lenient_ids.add(int(skipped_id))

                # The collected child values belong to the feature structure that just ended
                children.clear()
            elif state == INSIDE_ARRAY:
                # We saw the closing tag of an array element
                children[elem.tag].append(elem.text)
                state = INSIDE_FS
            else:
                raise RuntimeError(f"Invalid state transition: [{state}] 'end'")
        else:
            raise RuntimeError(f"Invalid XML event: [{event}]")

        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    # Post-process feature values
    for fs in feature_structures.values():
        t = typesystem.get_type(fs.type.name)

        for feature in t.all_features:
            feature_name = feature.name
            value = fs[feature_name]

            if feature_name == "sofa":
                # Replace the sofa id by the parsed sofa itself
                fs[feature_name] = sofas[value]
                continue

            if typesystem.is_instance_of(fs.type.name, TYPE_NAME_STRING_ARRAY):
                # We already parsed string arrays to a Python list of string
                # before, so we do not need to work more on this
                continue
            elif typesystem.is_primitive(feature.rangeType):
                fs[feature_name] = self._parse_primitive_value(feature.rangeType, value)
            elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
                # Separately rendered arrays (typically used with multipleReferencesAllowed = True)
                fs[feature_name] = self._parse_primitive_array(fs.type, value)
            elif typesystem.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
                # Array feature rendered inline (multipleReferencesAllowed = False|None)
                # We also end up here for array features that were rendered as child elements. No need to
                # parse them again, so we check if the value is still a string (i.e. attribute value) and
                # only then process it
                if isinstance(value, str):
                    FSType = feature.rangeType
                    fs[feature_name] = FSType(elements=self._parse_primitive_array(feature.rangeType, value))
            elif typesystem.is_primitive_list(feature.rangeType) and not feature.multipleReferencesAllowed:
                # List feature rendered inline (multipleReferencesAllowed = False|None)
                # We also end up here for list features that were rendered as child elements. No need to
                # parse them again, so we check if the value is still a string (i.e. attribute value) and
                # only then process it
                if isinstance(value, str):
                    fs[feature_name] = self._parse_primitive_list(feature.rangeType, value)
            else:
                # Resolve references here; unset features have nothing to resolve
                if value is None:
                    continue

                if fs.type.name == TYPE_NAME_FS_ARRAY or (
                    feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed
                ):
                    # An array of references is a list of integers separated
                    # by whitespace, e.g. <foo:bar elements="1 2 3 42" />
                    targets = [feature_structures[int(ref)] for ref in value.split()]

                    if feature.rangeType.name == TYPE_NAME_FS_ARRAY:
                        # Wrap inline array into the appropriate array object
                        ArrayType = typesystem.get_type(TYPE_NAME_FS_ARRAY)
                        targets = ArrayType(elements=targets)

                    fs[feature_name] = targets
                elif feature.rangeType.name == TYPE_NAME_FS_LIST and not feature.multipleReferencesAllowed:
                    # List feature rendered inline (multipleReferencesAllowed = False|None)
                    # Only values that are still a plain string or a plain Python list are converted
                    if isinstance(value, (list, str)):
                        fs[feature_name] = self._parse_fs_list(feature_structures, feature.rangeType, value)
                else:
                    # A single reference: the value is the xmi id of the target feature structure
                    target_id = int(value)
                    fs[feature_name] = feature_structures[target_id]

    cas = Cas(typesystem=typesystem, lenient=lenient)
    for sofa in sofas.values():
        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")

            # We need to make sure that the sofa gets the real xmi, see #155
            view.get_sofa().xmiID = sofa.xmiID
        else:
            view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        # If a sofa has no members, then UIMA might omit the view. In that case,
        # we create an empty view for it.
        proto_view = views[sofa.xmiID] if sofa.xmiID in views else ProtoView(sofa.xmiID)

        for member_id in proto_view.members:
            # We ignore ids of feature structures for which we do not have a type
            if member_id in lenient_ids:
                continue

            fs = feature_structures[member_id]

            # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
            if typesystem.is_instance_of(fs.type.name, "uima.tcas.Annotation"):
                fs.begin = sofa._offset_converter.external_to_python(fs.begin)
                fs.end = sofa._offset_converter.external_to_python(fs.end)

            view.add(fs, keep_id=True)

    # Newly generated ids continue after the largest ones seen in the document
    cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
    cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

    return cas