def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: bool, trusted: bool):
    """Parse an XMI document into a ``Cas``.

    The document is read incrementally with ``etree.iterparse``. Sofas, views
    and feature structures are collected first; afterwards references between
    feature structures are resolved and the members of every view are added
    to the corresponding view of a newly created ``Cas``.

    Args:
        source: File-like object or string handed to ``etree.iterparse``.
        typesystem: Type system used to build and inspect feature structures.
        lenient: If true, a feature structure whose type is not found is
            skipped with a warning instead of raising.
        trusted: Forwarded to ``etree.iterparse`` as ``huge_tree``.

    Returns:
        The deserialized ``Cas``.

    Raises:
        TypeNotFoundError: If a type is not found and ``lenient`` is false.
        RuntimeError: If the parser state machine sees an unexpected event.
    """
    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    sofas = {}
    views = {}
    feature_structures = {}
    children = defaultdict(list)
    # xmi ids of feature structures that were skipped because their type is unknown
    lenient_ids = set()

    context = etree.iterparse(source, events=("start", "end"), huge_tree=trusted)

    state = OUTSIDE_FS
    # Reset the counters that seed the id generators at the end of this method.
    # NOTE(review): presumably updated by the _parse_* helpers - confirm.
    self._max_xmi_id = 0
    self._max_sofa_num = 0

    for event, elem in context:
        # Ignore the 'xmi:XMI'
        if elem.tag == TAG_XMI:
            pass
        elif elem.tag == TAG_CAS_SOFA:
            if event == "end":
                sofa = self._parse_sofa(elem)
                sofas[sofa.xmiID] = sofa
        elif elem.tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        else:
            # In XMI, array element features can be encoded as
            #
            # <cas:StringArray>
            #     <elements>LNC</elements>
            #     <elements>MTH</elements>
            #     <elements>SNOMEDCT_US</elements>
            # </cas:StringArray>
            #
            # In order to parse this with an incremental XML parser, we need to employ
            # a simple state machine. It is depicted in the following.
            #
            #                    "start"               "start"
            #      +-----------+-------->+-----------+-------->+--------+
            #      |  Outside  |         |  Inside   |         | Inside |
            # +--->+  feature  |         |  feature  |         | array  |
            #      | structure |         | structure |         | element|
            #      +-----------+<--------+-----------+<--------+--------+
            #                     "end"                 "end"
            if event == "start":
                if state == OUTSIDE_FS:
                    # We saw the opening tag of a new feature structure
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # We saw the opening tag of an array element
                    state = INSIDE_ARRAY
                else:
                    raise RuntimeError("Invalid state transition: [{0}] 'start'".format(state))
            elif event == "end":
                if state == INSIDE_FS:
                    # We saw the closing tag of a new feature
                    state = OUTSIDE_FS

                    # If a type was not found, ignore it if lenient, else raise an exception
                    try:
                        fs = self._parse_feature_structure(typesystem, elem, children)
                        feature_structures[fs.xmiID] = fs
                    except TypeNotFoundError as e:
                        if not lenient:
                            # Bare raise keeps the original traceback intact
                            raise

                        warnings.warn(e.message)
                        xmiID = elem.attrib.get(NS_XMI + "id")
                        if xmiID:
                            lenient_ids.add(int(xmiID))

                    children.clear()
                elif state == INSIDE_ARRAY:
                    # We saw the closing tag of an array element
                    children[elem.tag].append(elem.text)
                    state = INSIDE_FS
                else:
                    raise RuntimeError("Invalid state transition: [{0}] 'end'".format(state))
            else:
                raise RuntimeError("Invalid XML event: [{0}]".format(event))

        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    # Post-process feature values
    for fs in feature_structures.values():
        t = typesystem.get_type(fs.type)
        for feature in t.all_features:
            feature_name = feature.name

            if feature_name == "sofa":
                # Replace the sofa's xmi id by the parsed sofa itself
                value = getattr(fs, feature_name)
                setattr(fs, feature_name, sofas[value])
                continue

            if (
                typesystem.is_primitive(feature.rangeTypeName)
                or typesystem.is_primitive_collection(feature.rangeTypeName)
                or typesystem.is_primitive_collection(fs.type)
            ):
                # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                continue

            # Resolve references here
            value = getattr(fs, feature_name)
            if value is None:
                continue

            if typesystem.is_collection(fs.type, feature):
                # A collection of references is a list of integers separated
                # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                targets = [feature_structures[int(ref)] for ref in value.split()]
                setattr(fs, feature_name, targets)
            else:
                setattr(fs, feature_name, feature_structures[int(value)])

    cas = Cas(typesystem=typesystem, lenient=lenient)
    for sofa in sofas.values():
        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")

            # We need to make sure that the sofa gets the real xmi, see #155
            view.get_sofa().xmiID = sofa.xmiID
        else:
            view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        # If a sofa has no members, then UIMA might omit the view. In that case,
        # we create an empty view for it.
        if sofa.xmiID in views:
            proto_view = views[sofa.xmiID]
        else:
            proto_view = ProtoView(sofa.xmiID)

        for member_id in proto_view.members:
            # We ignore ids of feature structures for which we do not have a type
            if member_id in lenient_ids:
                continue

            fs = feature_structures[member_id]

            # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
            if typesystem.is_instance_of(fs.type, "uima.tcas.Annotation"):
                try:
                    fs.begin = sofa._offset_converter.uima_to_cassis(fs.begin)
                    fs.end = sofa._offset_converter.uima_to_cassis(fs.end)
                except KeyError:
                    # NOTE(review): a KeyError raised during conversion is deliberately
                    # ignored; an offset that was not converted keeps its current value.
                    pass

            view.add_annotation(fs, keep_id=True)

    cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
    cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

    return cas
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem, lenient: bool, trusted: bool):
    """Parse an XMI document into a ``Cas``.

    The document is read incrementally with ``etree.iterparse``. Sofas, views
    and feature structures are collected first. Afterwards the raw feature
    values are converted (primitives, primitive arrays/lists) or resolved
    (references to other feature structures), and the members of every view
    are added to the corresponding view of a newly created ``Cas``.

    Args:
        source: File-like object or string handed to ``etree.iterparse``.
        typesystem: Type system used to build and inspect feature structures.
        lenient: If true, a feature structure whose type is not found is
            skipped with a warning instead of raising.
        trusted: Forwarded to ``etree.iterparse`` as ``huge_tree``.

    Returns:
        The deserialized ``Cas``.

    Raises:
        TypeNotFoundError: If a type is not found and ``lenient`` is false.
        RuntimeError: If the parser state machine sees an unexpected event.
    """
    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    sofas = {}
    views = {}
    feature_structures = {}
    children = defaultdict(list)
    # xmi ids of feature structures that were skipped because their type is unknown
    lenient_ids = set()

    context = etree.iterparse(source, events=("start", "end"), huge_tree=trusted)

    state = OUTSIDE_FS
    # Reset the counters that seed the id generators at the end of this method.
    # NOTE(review): presumably updated by the _parse_* helpers - confirm.
    self._max_xmi_id = 0
    self._max_sofa_num = 0

    for event, elem in context:
        # Ignore the 'xmi:XMI'
        if elem.tag == TAG_XMI:
            pass
        elif elem.tag == TAG_CAS_SOFA:
            if event == "end":
                sofa = self._parse_sofa(typesystem, elem)
                sofas[sofa.xmiID] = sofa
        elif elem.tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        else:
            # In XMI, array element features can be encoded as
            #
            # <cas:StringArray>
            #     <elements>LNC</elements>
            #     <elements>MTH</elements>
            #     <elements>SNOMEDCT_US</elements>
            # </cas:StringArray>
            #
            # In order to parse this with an incremental XML parser, we need to employ
            # a simple state machine. It is depicted in the following.
            #
            #                    "start"               "start"
            #      +-----------+-------->+-----------+-------->+--------+
            #      |  Outside  |         |  Inside   |         | Inside |
            # +--->+  feature  |         |  feature  |         | array  |
            #      | structure |         | structure |         | element|
            #      +-----------+<--------+-----------+<--------+--------+
            #                     "end"                 "end"
            if event == "start":
                if state == OUTSIDE_FS:
                    # We saw the opening tag of a new feature structure
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # We saw the opening tag of an array element
                    state = INSIDE_ARRAY
                else:
                    raise RuntimeError(f"Invalid state transition: [{state}] 'start'")
            elif event == "end":
                if state == INSIDE_FS:
                    # We saw the closing tag of a new feature
                    state = OUTSIDE_FS

                    # If a type was not found, ignore it if lenient, else raise an exception
                    try:
                        fs = self._parse_feature_structure(typesystem, elem, children)
                        feature_structures[fs.xmiID] = fs
                    except TypeNotFoundError as e:
                        if not lenient:
                            # Bare raise keeps the original traceback intact
                            raise

                        warnings.warn(e.message)
                        xmiID = elem.attrib.get(NS_XMI + "id")
                        if xmiID:
                            lenient_ids.add(int(xmiID))

                    children.clear()
                elif state == INSIDE_ARRAY:
                    # We saw the closing tag of an array element
                    children[elem.tag].append(elem.text)
                    state = INSIDE_FS
                else:
                    raise RuntimeError(f"Invalid state transition: [{state}] 'end'")
            else:
                raise RuntimeError(f"Invalid XML event: [{event}]")

        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    # Post-process feature values
    for fs in feature_structures.values():
        t = typesystem.get_type(fs.type.name)
        for feature in t.all_features:
            feature_name = feature.name
            value = fs[feature_name]

            if feature_name == "sofa":
                # Replace the sofa's xmi id by the parsed sofa itself
                fs[feature_name] = sofas[value]
                continue

            if typesystem.is_instance_of(fs.type.name, TYPE_NAME_STRING_ARRAY):
                # We already parsed string arrays to a Python list of string
                # before, so we do not need to work more on this
                continue
            elif typesystem.is_primitive(feature.rangeType):
                fs[feature_name] = self._parse_primitive_value(feature.rangeType, value)
                continue
            elif typesystem.is_primitive_array(fs.type) and feature_name == "elements":
                # Separately rendered arrays (typically used with multipleReferencesAllowed = True)
                fs[feature_name] = self._parse_primitive_array(fs.type, value)
            elif typesystem.is_primitive_array(feature.rangeType) and not feature.multipleReferencesAllowed:
                # Array feature rendered inline (multipleReferencesAllowed = False|None)
                # We also end up here for array features that were rendered as child elements. No need to parse
                # them again, so we check if the value is still a string (i.e. attribute value) and only then
                # process it
                if isinstance(value, str):
                    FSType = feature.rangeType
                    fs[feature_name] = FSType(elements=self._parse_primitive_array(feature.rangeType, value))
            elif typesystem.is_primitive_list(feature.rangeType) and not feature.multipleReferencesAllowed:
                # List feature rendered inline (multipleReferencesAllowed = False|None)
                # Only a value that is still a string (i.e. attribute value) is parsed here; any other
                # value is left untouched
                if isinstance(value, str):
                    fs[feature_name] = self._parse_primitive_list(feature.rangeType, value)
            else:
                # Resolve references here
                if value is None:
                    continue

                if fs.type.name == TYPE_NAME_FS_ARRAY or (
                    feature.rangeType.name == TYPE_NAME_FS_ARRAY and not feature.multipleReferencesAllowed
                ):
                    # An array of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = [feature_structures[int(ref)] for ref in value.split()]

                    if feature.rangeType.name == TYPE_NAME_FS_ARRAY:
                        # Wrap inline array into the appropriate array object
                        ArrayType = typesystem.get_type(TYPE_NAME_FS_ARRAY)
                        targets = ArrayType(elements=targets)

                    fs[feature_name] = targets
                elif feature.rangeType.name == TYPE_NAME_FS_LIST and not feature.multipleReferencesAllowed:
                    # List feature rendered inline (multipleReferencesAllowed = False|None)
                    # The value is handed to _parse_fs_list when it is either a list or a string;
                    # any other value is left untouched
                    if isinstance(value, (list, str)):
                        fs[feature_name] = self._parse_fs_list(feature_structures, feature.rangeType, value)
                else:
                    # Single reference: the value is the xmi id of the target
                    fs[feature_name] = feature_structures[int(value)]

    cas = Cas(typesystem=typesystem, lenient=lenient)
    for sofa in sofas.values():
        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")

            # We need to make sure that the sofa gets the real xmi, see #155
            view.get_sofa().xmiID = sofa.xmiID
        else:
            view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        # If a sofa has no members, then UIMA might omit the view. In that case,
        # we create an empty view for it.
        if sofa.xmiID in views:
            proto_view = views[sofa.xmiID]
        else:
            proto_view = ProtoView(sofa.xmiID)

        for member_id in proto_view.members:
            # We ignore ids of feature structures for which we do not have a type
            if member_id in lenient_ids:
                continue

            fs = feature_structures[member_id]

            # Map from offsets in UIMA UTF-16 based offsets to Unicode codepoints
            if typesystem.is_instance_of(fs.type.name, "uima.tcas.Annotation"):
                fs.begin = sofa._offset_converter.external_to_python(fs.begin)
                fs.end = sofa._offset_converter.external_to_python(fs.end)

            view.add(fs, keep_id=True)

    cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
    cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

    return cas
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
    """Build a ``Cas`` from an XMI document.

    The XML is consumed event by event via ``etree.iterparse``. Sofas, views
    and feature structures are gathered during that pass; references between
    feature structures are resolved afterwards, and finally every view's
    members are added as annotations to the matching view of a new ``Cas``.

    Args:
        source: File-like object or string handed to ``etree.iterparse``.
        typesystem: Type system used to build and inspect feature structures.

    Returns:
        The deserialized ``Cas``.

    Raises:
        RuntimeError: On an unexpected parser event or state transition, or
            when the number of sofas differs from the number of views.
    """
    # XML namespaces and the qualified tags that receive special treatment
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_NULL = NS_CAS + "NULL"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    # In XMI, array element features can be encoded as
    #
    # <cas:StringArray>
    #     <elements>LNC</elements>
    #     <elements>MTH</elements>
    #     <elements>SNOMEDCT_US</elements>
    # </cas:StringArray>
    #
    # In order to parse this with an incremental XML parser, we need to employ
    # a simple state machine. It is depicted in the following.
    #
    #                    "start"               "start"
    #      +-----------+-------->+-----------+-------->+--------+
    #      |  Outside  |         |  Inside   |         | Inside |
    # +--->+  feature  |         |  feature  |         | array  |
    #      | structure |         | structure |         | element|
    #      +-----------+<--------+-----------+<--------+--------+
    #                     "end"                 "end"
    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    # Legal transitions for a "start" event, keyed by the current state
    on_start = {OUTSIDE_FS: INSIDE_FS, INSIDE_FS: INSIDE_ARRAY}

    parsed_sofas = []
    proto_views = {}
    fs_by_id = {}
    array_children = defaultdict(list)

    parser_state = OUTSIDE_FS

    for event, elem in etree.iterparse(source, events=("start", "end")):
        tag = elem.tag

        if tag in (TAG_XMI, TAG_CAS_NULL):
            # The 'xmi:XMI' and 'cas:NULL' elements carry nothing we need
            pass
        elif tag == TAG_CAS_SOFA:
            if event == "end":
                parsed_sofas.append(self._parse_sofa(elem))
        elif tag == TAG_CAS_VIEW:
            if event == "end":
                parsed_view = self._parse_view(elem)
                proto_views[parsed_view.sofa] = parsed_view
        elif event == "start":
            # Opening tag of either a feature structure or one of its array elements
            if parser_state not in on_start:
                raise RuntimeError("Invalid state transition: [{0}] 'start'".format(parser_state))
            parser_state = on_start[parser_state]
        elif event == "end":
            if parser_state == INSIDE_FS:
                # Closing tag of a feature structure: build it from the element
                # and the array children collected so far
                parser_state = OUTSIDE_FS
                parsed_fs = self._parse_feature_structure(typesystem, elem, array_children)
                fs_by_id[parsed_fs.xmiID] = parsed_fs
                array_children.clear()
            elif parser_state == INSIDE_ARRAY:
                # Closing tag of an array element: remember its text under its tag
                array_children[tag].append(elem.text)
                parser_state = INSIDE_FS
            else:
                raise RuntimeError("Invalid state transition: [{0}] 'end'".format(parser_state))
        else:
            raise RuntimeError("Invalid XML event: [{0}]".format(event))

        # Release elements that have been fully processed
        if event == "end":
            self._clear_elem(elem)

    if len(parsed_sofas) != len(proto_views):
        raise RuntimeError("Number of views and sofas is not equal!")

    # Second pass: turn xmi id references into the referenced feature structures
    for current in fs_by_id.values():
        fs_type = typesystem.get_type(current.type)

        for feature in fs_type.all_features:
            name = feature.name
            if name == "sofa":
                continue

            range_name = feature.rangeTypeName
            if typesystem.is_primitive(range_name) or typesystem.is_primitive_collection(range_name):
                # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                continue

            raw = getattr(current, name)
            if raw is None:
                continue

            if typesystem.is_collection(range_name):
                # Collections of references are whitespace separated integers,
                # e.g. <foo:bar elements="1 2 3 42" />
                resolved = [fs_by_id[int(token)] for token in raw.split()]
            else:
                resolved = fs_by_id[int(raw)]

            setattr(current, name, resolved)

    cas = Cas(typesystem)

    for sofa in parsed_sofas:
        members = proto_views[sofa.xmiID]

        is_initial = sofa.sofaID == "_InitialView"
        view = cas.get_view("_InitialView") if is_initial else cas.create_view(sofa.sofaID)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        for member_id in members.members:
            view.add_annotation(fs_by_id[member_id])

    return cas
def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
    """Parse an XMI document into a ``Cas``.

    The document is read incrementally with ``etree.iterparse``. Sofas, views
    and feature structures are collected first; afterwards references between
    feature structures are resolved and the members of every view are added
    to the corresponding view of a newly created ``Cas``. While adding the
    members, ``begin``/``end`` offsets are rewritten with a
    ``JavaOffsetsMapper`` built from the view's sofa string.

    Args:
        source: File-like object or string handed to ``etree.iterparse``.
        typesystem: Type system used to build and inspect feature structures.

    Returns:
        The deserialized ``Cas``.

    Raises:
        RuntimeError: If the parser state machine sees an unexpected event.
    """
    # Local import so this block does not depend on the file's import header
    import warnings

    # namespaces
    NS_XMI = "{http://www.omg.org/XMI}"
    NS_CAS = "{http:///uima/cas.ecore}"

    TAG_XMI = NS_XMI + "XMI"
    TAG_CAS_SOFA = NS_CAS + "Sofa"
    TAG_CAS_VIEW = NS_CAS + "View"

    OUTSIDE_FS = 1
    INSIDE_FS = 2
    INSIDE_ARRAY = 3

    sofas = {}
    views = {}
    feature_structures = {}
    children = defaultdict(list)

    context = etree.iterparse(source, events=("start", "end"))

    state = OUTSIDE_FS
    # Reset the counters that seed the id generators at the end of this method.
    # NOTE(review): presumably updated by the _parse_* helpers - confirm.
    self._max_xmi_id = 0
    self._max_sofa_num = 0

    for event, elem in context:
        # Ignore the 'xmi:XMI'
        if elem.tag == TAG_XMI:
            pass
        elif elem.tag == TAG_CAS_SOFA:
            if event == "end":
                sofa = self._parse_sofa(elem)
                sofas[sofa.xmiID] = sofa
        elif elem.tag == TAG_CAS_VIEW:
            if event == "end":
                proto_view = self._parse_view(elem)
                views[proto_view.sofa] = proto_view
        else:
            # In XMI, array element features can be encoded as
            #
            # <cas:StringArray>
            #     <elements>LNC</elements>
            #     <elements>MTH</elements>
            #     <elements>SNOMEDCT_US</elements>
            # </cas:StringArray>
            #
            # In order to parse this with an incremental XML parser, we need to employ
            # a simple state machine. It is depicted in the following.
            #
            #                    "start"               "start"
            #      +-----------+-------->+-----------+-------->+--------+
            #      |  Outside  |         |  Inside   |         | Inside |
            # +--->+  feature  |         |  feature  |         | array  |
            #      | structure |         | structure |         | element|
            #      +-----------+<--------+-----------+<--------+--------+
            #                     "end"                 "end"
            if event == "start":
                if state == OUTSIDE_FS:
                    # We saw the opening tag of a new feature structure
                    state = INSIDE_FS
                elif state == INSIDE_FS:
                    # We saw the opening tag of an array element
                    state = INSIDE_ARRAY
                else:
                    raise RuntimeError("Invalid state transition: [{0}] 'start'".format(state))
            elif event == "end":
                if state == INSIDE_FS:
                    # We saw the closing tag of a new feature
                    state = OUTSIDE_FS
                    fs = self._parse_feature_structure(typesystem, elem, children)
                    feature_structures[fs.xmiID] = fs
                    children.clear()
                elif state == INSIDE_ARRAY:
                    # We saw the closing tag of an array element
                    children[elem.tag].append(elem.text)
                    state = INSIDE_FS
                else:
                    raise RuntimeError("Invalid state transition: [{0}] 'end'".format(state))
            else:
                raise RuntimeError("Invalid XML event: [{0}]".format(event))

        # Free already processed elements from memory
        if event == "end":
            self._clear_elem(elem)

    # Post-process feature values
    for fs in feature_structures.values():
        t = typesystem.get_type(fs.type)
        for feature in t.all_features:
            feature_name = feature.name

            if feature_name == "sofa":
                # Replace the sofa's xmi id by the parsed sofa itself
                value = getattr(fs, feature_name)
                setattr(fs, feature_name, sofas[value])
                continue

            if (
                typesystem.is_primitive(feature.rangeTypeName)
                or typesystem.is_primitive_collection(feature.rangeTypeName)
                or typesystem.is_primitive_collection(fs.type)
            ):
                # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                continue

            # Resolve references here
            value = getattr(fs, feature_name)
            if value is None:
                continue

            if typesystem.is_collection(fs.type, feature):
                # A collection of references is a list of integers separated
                # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                targets = [feature_structures[int(ref)] for ref in value.split()]
                setattr(fs, feature_name, targets)
            else:
                setattr(fs, feature_name, feature_structures[int(value)])

    cas = Cas(typesystem=typesystem)
    for sofa in sofas.values():
        if sofa.sofaID == "_InitialView":
            view = cas.get_view("_InitialView")
        else:
            view = cas.create_view(sofa.sofaID, xmiID=sofa.xmiID, sofaNum=sofa.sofaNum)

        view.sofa_string = sofa.sofaString
        view.sofa_mime = sofa.mimeType

        # If a sofa has no members, then UIMA might omit the view. In that case,
        # we create an empty view for it.
        if sofa.xmiID in views:
            proto_view = views[sofa.xmiID]
        else:
            proto_view = ProtoView(sofa.xmiID)

        # Patch: Rewrite offsets. Without a sofa string there is nothing to map against.
        if view.sofa_string is not None:
            mapper = JavaOffsetsMapper(view.sofa_string)
        else:
            mapper = None

        for member_id in proto_view.members:
            annotation = feature_structures[member_id]

            # Only feature structures that carry both offsets are rewritten
            if (
                mapper
                and hasattr(annotation, "begin")
                and hasattr(annotation, "end")
                and annotation.begin is not None
                and annotation.end is not None
            ):
                annotation.begin = mapper.java_to_python_begin(annotation.begin)
                try:
                    annotation.end = mapper.java_to_python_end(annotation.end)
                except Exception as e:
                    # Best effort, as before: the annotation is still added with its
                    # unmapped end offset, but the failure is reported as a warning
                    # instead of debug prints on stdout.
                    # NOTE(review): the exception types raised by the mapper are not
                    # visible here, hence the broad catch - narrow it if possible.
                    warnings.warn(
                        "Could not map end offset [{0}] to a Python offset: {1}".format(annotation.end, e)
                    )

            view.add_annotation(annotation, keep_id=True)

    cas._xmi_id_generator = IdGenerator(self._max_xmi_id + 1)
    cas._sofa_num_generator = IdGenerator(self._max_sofa_num + 1)

    return cas