def process_xml(self, packet):
    """
    Drain the parse-event iterator held in ``self.context``.

    Every element whose local (namespace-less) tag is listed in
    ``self.element_tags`` is, on its "end" event, stored in ``packet.data``,
    removed from ``self.root`` and handed to the next component (if any).

    :param packet: the Packet to fill and pass on
    :return: the packet; flagged end-of-doc once the iterator is exhausted,
             returned untouched when there is no active context
    """
    while self.context is not None:
        try:
            # Builtin next() works on Python 2.6+ and 3; iterators have no
            # .next() method on Python 3.
            event, elem = next(self.context)
        except (etree.XMLSyntaxError, StopIteration):
            # workaround for etree.XMLSyntaxError https://bugs.launchpad.net/lxml/+bug/1185701
            self.context = None

            # Always end of doc
            # TODO: is this still useful for a non-input component?
            packet.set_end_of_doc()
            log.info("End of doc: %s elem_count=%d" % (self.cur_file_path, self.elem_count))
            return packet

        # Filter out Namespace from the tag ("{uri}name" -> "name");
        # this is the easiest way to go for now
        tag_parts = elem.tag.split('}')
        tag = tag_parts[1] if len(tag_parts) == 2 else tag_parts[0]

        # Only completed ("end") elements of interest are emitted
        if tag not in self.element_tags or event != "end":
            continue

        packet.data = elem
        self.elem_count += 1

        # Delete the element from the tree.
        # NOTE(review): remove() only works for direct children of
        # self.root - confirm matched elements are never nested deeper.
        self.root.remove(elem)

        if self.strip_namespaces:
            packet.data = Util.stripNamespaces(elem).getroot()

        # If there is a next component, let it process
        if self.next:
            # Hand-over data (line, doc whatever) to the next component
            packet.format = self._output_format
            packet = self.next.process(packet)

    return packet
def process_xml(self, packet):
    """
    Drain the parse-event iterator held in ``self.context``.

    Every element whose local (namespace-less) tag is listed in
    ``self.element_tags`` is, on its "end" event, copied into
    ``packet.data``, cleared in the source tree and handed to the next
    component (if any).

    :param packet: the Packet to fill and pass on
    :return: the packet; flagged end-of-doc once the iterator is exhausted,
             returned untouched when there is no active context
    """
    while self.context is not None:
        try:
            event, elem = next(self.context)
        except (etree.XMLSyntaxError, StopIteration):
            # workaround for etree.XMLSyntaxError https://bugs.launchpad.net/lxml/+bug/1185701
            self.context = None

            # Always end of doc
            # TODO: is this still useful for a non-input component?
            packet.set_end_of_doc()
            log.info("End of doc: %s elem_count=%d" % (self.cur_file_path, self.elem_count))
            return packet

        # Filter out Namespace from the tag ("{uri}name" -> "name");
        # this is the easiest way to go for now
        tag_parts = elem.tag.split('}')
        tag = tag_parts[1] if len(tag_parts) == 2 else tag_parts[0]

        # Only completed ("end") elements of interest are emitted
        if tag not in self.element_tags or event != "end":
            continue

        self.elem_count += 1

        if self.strip_namespaces:
            # NOTE(review): presumably stripNamespaces returns a new tree
            # detached from elem - verify, otherwise clear() below would
            # also empty packet.data.
            packet.data = Util.stripNamespaces(elem).getroot()
        else:
            # Copy only when the copy is actually used: the element is
            # cleared right below, so downstream needs its own instance.
            packet.data = deepcopy(elem)

        # Clear the element which has been read. Don't clear the root document,
        # since the last element hasn't been processed yet.
        elem.clear()

        # If there is a next component, let it process
        if self.next:
            # Hand-over data (line, doc whatever) to the next component
            packet.format = self._output_format
            packet = self.next.process(packet)

    return packet
def read(self, packet):
    """
    Consume one parse event and, if it completes an element of interest,
    put that element in ``packet.data``.

    When no parse is in progress the next path is popped from
    ``self.file_list`` and opened. ``packet.data`` is reset to None on every
    call, so it stays None for events that do not complete a matching element.

    :param packet: the Packet to fill
    :return: the same packet; flagged end-of-doc when the current file is
             exhausted and additionally end-of-stream when no files remain
    """
    packet.data = None

    if self.context is None:
        if not self.file_list:
            # No more files left, all done
            log.info("No more files left")
            return packet

        # Files available: pop next file
        self.cur_file_path = self.file_list.pop(0)
        # Binary mode: let the XML parser do the decoding itself.
        fd = open(self.cur_file_path, 'rb')
        # Remember the handle so it can be closed at end of doc.
        self._cur_fd = fd
        self.elem_count = 0
        log.info("file opened : %s" % self.cur_file_path)
        self.context = iter(etree.iterparse(fd, events=("start", "end")))
        # The first event delivers the document root element.
        _, self.root = next(self.context)

    try:
        # Builtin next() works on Python 2.6+ and 3; iterators have no
        # .next() method on Python 3.
        event, elem = next(self.context)
    except (etree.XMLSyntaxError, StopIteration):
        # workaround for etree.XMLSyntaxError https://bugs.launchpad.net/lxml/+bug/1185701
        self.context = None

        # Release the file that was opened for this document (the handle is
        # absent when the context was set up by other code).
        cur_fd = getattr(self, "_cur_fd", None)
        if cur_fd is not None:
            cur_fd.close()
            self._cur_fd = None

        # Always end of doc
        packet.set_end_of_doc()
        log.info("End of doc: %s elem_count=%d" % (self.cur_file_path, self.elem_count))

        # Maybe end of stream (all docs done)
        if not self.file_list:
            # No more files left: end of stream
            packet.set_end_of_stream()
            log.info("End of stream")

        return packet

    # Filter out Namespace from the tag ("{uri}name" -> "name");
    # this is the easiest way to go for now
    tag_parts = elem.tag.split('}')
    tag = tag_parts[1] if len(tag_parts) == 2 else tag_parts[0]

    # Only completed ("end") elements of interest are emitted
    if tag in self.element_tags and event == "end":
        packet.data = elem
        self.elem_count += 1

        # Delete the element from the tree.
        # NOTE(review): remove() only works for direct children of
        # self.root - confirm matched elements are never nested deeper.
        self.root.remove(elem)

        if self.strip_namespaces:
            packet.data = Util.stripNamespaces(elem).getroot()

    return packet
class XmlElementStreamerFileInput(FileInput):
    """
    Extracts XML elements from a file, outputs each feature element in Packet.
    Parsing is streaming (no internal DOM buildup) so any file size can be handled.
    Use this class for your big GML files!

    Each call to read() consumes a single parse event; packet.data is only
    set when that event completes an element listed in 'element_tags'.

    produces=FORMAT.etree_element_stream
    """

    # Constructor
    def __init__(self, configdict, section):
        FileInput.__init__(self, configdict, section, produces=FORMAT.etree_element_stream)
        # Comma-separated local (namespace-less) tag names to emit
        self.element_tags = self.cfg.get('element_tags').split(',')
        self.file_list_done = []
        self.strip_namespaces = self.cfg.get('strip_namespaces', False)
        # Iterator over (event, element) pairs of the file being parsed
        self.context = None
        # Root element of the file being parsed
        self.root = None
        self.cur_file_path = None
        # Handle of the file being parsed, kept so it can be closed
        self._xml_file = None
        self.elem_count = 0

    def read(self, packet):
        """
        Consume one parse event and, if it completes an element of interest,
        put that element in packet.data.

        When no parse is in progress the next path is popped from
        self.file_list and opened. packet.data is reset to None on every
        call.

        :param packet: the Packet to fill
        :return: the same packet; flagged end-of-doc when the current file
                 is exhausted and additionally end-of-stream when no files
                 remain
        """
        packet.data = None

        if self.context is None:
            if not self.file_list:
                # No more files left, all done
                log.info("No more files left")
                return packet

            # Files available: pop next file
            self.cur_file_path = self.file_list.pop(0)
            # Binary mode: let the XML parser do the decoding itself.
            self._xml_file = open(self.cur_file_path, 'rb')
            self.elem_count = 0
            log.info("file opened : %s" % self.cur_file_path)
            self.context = iter(etree.iterparse(self._xml_file, events=("start", "end")))
            # The first event delivers the document root element.
            _, self.root = next(self.context)

        try:
            # Builtin next() works on Python 2.6+ and 3; iterators have no
            # .next() method on Python 3.
            event, elem = next(self.context)
        except StopIteration:
            self.context = None

            # Release the file that was opened for this document.
            if self._xml_file is not None:
                self._xml_file.close()
                self._xml_file = None

            # Always end of doc
            packet.set_end_of_doc()
            log.info("End of doc: %s elem_count=%d" % (self.cur_file_path, self.elem_count))

            # Maybe end of stream (all docs done)
            if not self.file_list:
                # No more files left: end of stream
                packet.set_end_of_stream()
                log.info("End of stream")

            return packet

        # Filter out Namespace from the tag ("{uri}name" -> "name");
        # this is the easiest way to go for now
        tag_parts = elem.tag.split('}')
        tag = tag_parts[1] if len(tag_parts) == 2 else tag_parts[0]

        # Only completed ("end") elements of interest are emitted
        if tag in self.element_tags and event == "end":
            packet.data = elem
            self.elem_count += 1

            # Delete the element from the tree.
            # NOTE(review): remove() only works for direct children of
            # self.root - confirm matched elements are never nested deeper.
            self.root.remove(elem)

            if self.strip_namespaces:
                packet.data = Util.stripNamespaces(elem).getroot()

        return packet