def test_parse(self):
    """Minimal test of DOMEventStream.parse()"""
    # Parse by filename and drain the event stream completely.
    events = pulldom.parse(tstfile)
    self.addCleanup(events.stream.close)
    for _event in events:
        pass
    # Parsing from an already-open binary file object must work too.
    with open(tstfile, 'rb') as handle:
        for _event in pulldom.parse(handle):
            pass
def read(self):
    """Yield an Entry for every <entry> element found in self.filename.

    Bug fix: the original immediately re-parsed sys.argv[1] after parsing
    self.filename, so the instance's filename attribute was silently
    ignored; the stray re-parse is removed.
    """
    events = pulldom.parse(self.filename)
    for (event, node) in events:
        if event == "START_ELEMENT" and node.tagName == "entry":
            # Pull the whole <entry> subtree into the node before wrapping.
            events.expandNode(node)
            entry = Entry(node)
            yield entry
def test_parse(self):
    """Minimal test of DOMEventStream.parse()"""
    # This just tests that parsing from a stream works. Actual parser
    # semantics are tested using parseString with a more focused XML
    # fragment.
    # Test with a filename:
    events = pulldom.parse(tstfile)
    self.addCleanup(events.stream.close)
    for _ in events:
        pass
    # Test with a file object:
    with open(tstfile, "rb") as stream:
        for _ in pulldom.parse(stream):
            pass
def PullParse(fileName, katalog, baseName):
    """Stream-parse fileName with pulldom and store every element and text
    node as an interval tuple (pre, post, parent, type, data) in an XmlDB
    created at katalog/baseName; returns the open db.
    """
    db = XmlDB(katalog + "/" + baseName)
    db.Open()
    events = pulldom.parse(fileName)
    # Preorder counter; 0 is reserved for the synthetic "doc" root tuple.
    pre = 1
    stack = []
    stack.append(XmlNodeTuple(0, type="elem", data="doc"))
    for (event, node) in events:
        if event == "START_ELEMENT":
            stack.append(XmlNodeTuple(pre, type="elem", data=node.tagName))
            pre += 1
        if event == "END_ELEMENT":
            x = stack.pop()
            if len(stack) > 0:
                parent = stack[len(stack) - 1].pre
            else:
                parent = 0
            # The element spans [x.pre, pre) in document order.
            db.AddTuple(x.pre, pre, parent, x.type, x.data)
            pre += 1
        # Skip trivial whitespace-only text nodes (bare newline / space).
        if event == "CHARACTERS" and node.data != "\n" and node.data != " ":
            db.AddTuple(pre, pre + 1, pre - 1, "text", node.data)
            pre += 2
    # Finally emit the synthetic document root; parent -1 marks the top.
    x = stack.pop()
    parent = -1
    db.AddTuple(x.pre, pre, parent, x.type, x.data)
    return db
def process_file(path):
    """Scan a bz2-compressed MediaWiki XML dump, counting pages and
    "living people" articles and printing titles of BLPs with no 'ref'.

    Mutates the module-level counters page_count, blp_count and found;
    t0 is presumably a module-level start timestamp -- confirm.
    """
    global page_count, blp_count, found
    doc = pulldom.parse(bz2.open(path))
    for event, node in doc:
        if event == pulldom.START_ELEMENT and node.tagName == 'page':
            page_count += 1
            # Periodic progress report every 1000 pages.
            if page_count % 1000 == 0:
                dt = datetime.now() - t0
                print(f'Done with {humanize.intcomma(page_count)} pages, {humanize.intcomma(blp_count)} blps, found {found} in {dt}')
            doc.expandNode(node)
            ns = node.getElementsByTagName('ns')[0].childNodes[0].nodeValue
            title = node.getElementsByTagName('title')[0].childNodes[0].nodeValue
            # Only namespace 0 (articles) is of interest.
            if not ns == '0':
                continue
            cdataNodes = node.getElementsByTagName('text')[0].childNodes
            content = ' '.join(node.nodeValue for node in cdataNodes).lower()
            if '#redirect' in content:
                continue
            if 'living people' in content:
                blp_count += 1
                # Flag biographies of living people containing no 'ref'.
                if not 'ref' in content:
                    found += 1
                    print('Found:', title)
def read_xml_or_gz_file(input_file_path):
    """
    Reads a file in xml format and returns its content. If the file is zipped, it unzips it first
    :param input_file_path:
    :return:
    """
    extension = os.path.splitext(input_file_path)[1]
    # Read biosamples from XML file
    if extension == '.gz':
        return pulldom.parse(gzip.open(input_file_path))
    if extension == '.xml':
        return pulldom.parse(input_file_path)
    print('Error: invalid file extension')
    sys.exit(1)
def __init__(self, gmlFile):
    """Open gmlFile in binary mode and set up a pulldom event stream.

    NOTE(review): the handle kept on self.gml is never closed here; the
    owner of this object is responsible for closing it.
    """
    self.gmlFile = gmlFile
    _gml = open(gmlFile, 'rb')
    self.gml = _gml
    # File size as float, presumably for progress reporting -- confirm
    # against callers.
    self._filesize = float(os.fstat(_gml.fileno()).st_size)
    self.events = pulldom.parse(_gml)
    self.numFeatures = 0
def process(self, ctx, m):
    """Split the XML document at self.path by self.tagname and yield one
    copied message per matching element, with the parsed lxml tree stored
    under m2['xml'].
    """
    path = ctx.interpolate(ctx, self.path)
    logger.debug("Reading XML in pull mode (splitting by tag '%s'): %s" % (self.tagname, path))
    with open(path, "r") as xmlfile:
        doc = pulldom.parse(xmlfile)
        for event, node in doc:
            if event == pulldom.START_ELEMENT and node.tagName == self.tagname:
                doc.expandNode(node)
                m2 = ctx.copy_message(m)
                # NOTE(review): on Python 3 concatenating str with the
                # bytes from .encode('utf-8') raises TypeError; this code
                # appears to assume Python 2 -- confirm.
                xmltext = node.toxml().encode('utf-8')
                # Wrap in a dummy root so prefixed fragments still parse.
                xmltext = "<root>" + xmltext + "</root>"
                parser = etree.XMLParser(recover=True, encoding="utf-8")
                xml = etree.fromstring(xmltext, parser=parser)
                # Strip namespace prefixes from all tag names.
                for elem in xml.iter():
                    if ":" in elem.tag:
                        elem.tag = ":".join(elem.tag.split(":")[1:])
                m2['xml'] = xml
                yield m2
def handle_children(xmlfile, handle_parsenode):
    """Stream xmlfile, calling handle_parsenode on every fully-expanded
    depth-1 element; returns the reconstructed open/close tags of the root
    element (with the schema reference rewritten for edge diffs).
    """
    root_open = None
    root_close = None
    level = 0
    xml_doc = pulldom.parse(xmlfile)
    for event, parsenode in xml_doc:
        if event == pulldom.START_ELEMENT:
            # print level, parsenode.getAttribute(ID_ATTR)
            if level == 0:
                root_open = parsenode.toprettyxml(indent="")
                # since we did not expand root_open contains the closing slash
                root_open = root_open[:-3] + ">\n"
                # change the schema for edge diffs
                root_open = root_open.replace("edges_file.xsd", "edgediff_file.xsd")
                root_close = "</%s>\n" % parsenode.localName
            if level == 1:
                # consumes END_ELEMENT, no level increase
                xml_doc.expandNode(parsenode)
                handle_parsenode(parsenode)
            else:
                level += 1
        elif event == pulldom.END_ELEMENT:
            level -= 1
    return root_open, root_close
def analyze_document(filename: str = "data/dblp.xml", expected_event_count: T.Optional[int] = 248393285):
    """
    New function for dblp xml analysis used to correct my database schema.
    :param filename: path to the dblp XML dump (its DTD lives next to it)
    :param expected_event_count: total pulldom events, used for the tqdm bar
    :return: None; prints the collected 'type' attribute values and mutates
        the module-level sets note_types / ee_types / url_types / isbn_types
    """
    os.chdir(
        os.path.dirname(filename)
    )  # so that relative reference to dtd file can be read by XML parser
    # `parser` is a module-level SAX parser -- presumably configured for
    # the dblp DTD; confirm at module scope.
    doc = pulldom.parse(filename, parser=parser, bufsize=2**14)
    for event, node in tqdm(doc, total=expected_event_count):
        if event == pulldom.START_ELEMENT:
            if node.tagName == "note":
                for k, v in node.attributes.items():
                    if k == 'type':
                        note_types.add(v)
            elif node.tagName == "ee":
                for k, v in node.attributes.items():
                    if k == 'type':
                        ee_types.add(v)
            elif node.tagName == "url":
                for k, v in node.attributes.items():
                    if k == 'type':
                        url_types.add(v)
            elif node.tagName == "isbn":
                for k, v in node.attributes.items():
                    if k == 'type':
                        isbn_types.add(v)
            # doc.expandNode(node)
    for s in [note_types, ee_types, url_types, isbn_types]:
        print(list(s))
def parse(xmlfile, element_names, element_attrs={}, attr_conversions={}):
    """
    Parses the given element_names from xmlfile and yields compound objects
    for their xml subtrees (no extra objects are returned if element_names
    appear nested in a matched subtree).
    The compound objects expose all attributes of the subtree root as
    python attributes unless element_attrs supplies an explicit attribute
    list per element name. attr_conversions may map attribute names to
    callables applied to the raw value before storing it.
    Children are reachable dictionary-style (o['child_element_name'] ->
    [osub0, osub1, ...]) and, when no name clash exists, attribute-style
    (o.child_element_name).
    @Note: All elements with the same name must have the same type
    regardless of the subtree in which they occur
    @Note: Attribute names may be modified to avoid name clashes with
    python keywords.
    @Example: parse('plain.edg.xml', ['edge'])
    """
    type_cache = {}
    event_stream = pulldom.parse(xmlfile)
    for event, node in event_stream:
        if event != pulldom.START_ELEMENT:
            continue
        if node.localName not in element_names:
            continue
        event_stream.expandNode(node)
        yield _get_compound_object(node, type_cache,
                                   node.localName,
                                   element_attrs, attr_conversions)
def parse (source):
    """parse(source) : Pattern

    Parses the XML from the input stream and returns a Pattern tree.
    Python 2 module (string exceptions raise syntax, dict.values() list).
    """
    stream = util.DOMTokenStream(pulldom.parse(source))
    element_map = {}
    # for item in stream:
    #     print item, stream.parents
    # Process the document prologue and the first two elements
    assert stream.next()[0] == pulldom.START_DOCUMENT
    stream.expect_element('grammar')
    stream.expect_element('start')
    # Parse the main pattern body
    pattern = parse_top(stream)
    stream.expect_end_element('start')
    # Process definition section
    while 1:
        event, node = stream.get_next_event()
        if event == pulldom.END_ELEMENT and node.localName == 'grammar':
            # We're done
            break
        elif event == pulldom.START_ELEMENT and node.localName == 'define':
            # Parse definition
            ncname = node.getAttributeNS(RNG.BASE, 'name')
            stream.expect_element('element')
            nc = parse_nameclass(stream)
            pattern = parse_top(stream)
            stream.expect_end_element('element')
            stream.expect_end_element('define')
            element_map[ncname] = relaxng.Element(nc=nc, p1=pattern)
        else:
            raise RuntimeError, 'Unexpected event: %r, %r' % (event, node)
    # Loop through all the patterns, replacing Ref instances
    # with the corresponding Element instance
    # XXX does this always terminate, given that there can be
    # cycles of Elements?
    # XXX on the other hand, does this cover every single pattern
    # node that could contain a Ref instance?
    queue = [pattern] + element_map.values()
    while len(queue):
        head = queue.pop()
        if hasattr(head, 'p1'):
            if isinstance(head.p1, Ref):
                head.p1 = element_map[head.p1.ref_name]
            else:
                queue.append(head.p1)
        if hasattr(head, 'p2'):
            if isinstance(head.p2, Ref):
                head.p2 = element_map[head.p2.ref_name]
            else:
                queue.append(head.p2)
    return relaxng.Schema(pattern)
def parse(xmlfile, element_names, element_attrs={}, attr_conversions={}):
    """
    Parses the given element_names from xmlfile and yields compound objects
    for their xml subtrees (no extra objects are returned if element_names
    appear nested in a matched subtree).
    The compound objects expose all attributes of the subtree root as
    python attributes unless element_attrs supplies an explicit attribute
    list per element name. attr_conversions may map attribute names to
    callables applied to the raw value before storing it.
    Children are reachable dictionary-style (o['child_element_name'] ->
    [osub0, osub1, ...]) and, when no name clash exists, attribute-style
    (o.child_element_name).
    @Note: All elements with the same name must have the same type
    regardless of the subtree in which they occur
    @Note: Attribute names may be modified to avoid name clashes with
    python keywords.
    @Note: element_names may be a single string or a list of strings.
    @Example: parse('plain.edg.xml', ['edge'])
    """
    # Accept a bare element name for convenience.
    if isinstance(element_names, str):
        element_names = [element_names]
    type_cache = {}
    event_stream = pulldom.parse(xmlfile)
    for event, node in event_stream:
        if event != pulldom.START_ELEMENT:
            continue
        if node.localName not in element_names:
            continue
        event_stream.expandNode(node)
        yield _get_compound_object(node, type_cache,
                                   node.localName,
                                   element_attrs, attr_conversions)
def extract(input_xml):
    """Process entire input XML document, firing on events"""
    # Start pulling; it continues automatically
    doc = pulldom.parse(input_xml)
    pieces = []
    for event, node in doc:
        starting = event == pulldom.START_ELEMENT
        ending = event == pulldom.END_ELEMENT
        # elements to ignore: xml
        if starting and node.localName in ignore:
            continue
        # copy comments intact
        if event == pulldom.COMMENT:
            doc.expandNode(node)
            pieces.append(node.toxml())
        # empty inline elements: pb, milestone
        elif starting and node.localName in inlineEmpty:
            pieces.append(node.toxml())
        # non-empty inline elements: note, hi, head, l, lg, div, p, ab
        elif starting and node.localName in inlineContent:
            pieces.append(regexEmptyTag.sub('>', node.toxml()))
        elif ending and node.localName in inlineContent:
            pieces.append('</' + node.localName + '>')
        elif starting and node.localName in blockElement:
            pieces.append('\n<' + node.localName + '>\n')
        elif ending and node.localName in blockElement:
            pieces.append('\n</' + node.localName + '>')
        elif event == pulldom.CHARACTERS:
            pieces.append(normalizeSpace(node.data))
    return ''.join(pieces)
def pulldom(self, filename, element_name):
    """
    Return an iterator over dictionaries of elements that match
    `element_name`.

    This uses the pulldom parser, so it's more memory efficient for
    large xml files.

    Fix: the file is now opened with a context manager so the handle is
    closed even if parsing raises or the consumer abandons the generator;
    the original opened it manually and only closed on full exhaustion.
    """
    from xml.dom import pulldom
    from xml.dom import Node
    with open(filename, 'r') as fh:
        events = pulldom.parse(fh)
        for event in events:
            node_type, node = event
            if node_type == 'START_ELEMENT' and node.nodeName == element_name:
                events.expandNode(node)
                node.normalize()
                record = {}
                # TODO: This only parses a flat list of elements. It should
                # probably handle attributes or nested elements as well.
                for subnode in node.childNodes:
                    if subnode.nodeType != Node.TEXT_NODE:
                        if subnode.hasChildNodes():
                            record[subnode.nodeName] = subnode.firstChild.nodeValue
                        else:
                            record[subnode.nodeName] = ''
                yield record
def sort_departs(routefilename, outfile):
    """Copy a SUMO route file to outfile with all departing elements
    (those listed in DEPART_ATTRS) sorted by their depart/begin time;
    other elements are copied through in document order.
    """
    routes_doc = pulldom.parse(routefilename)
    vehicles = []
    root = None
    for event, parsenode in routes_doc:
        if event == pulldom.START_ELEMENT:
            if root is None:
                # First element is the document root; echo its open tag.
                root = parsenode.localName
                outfile.write("<%s>\n" % root)
                continue
            routes_doc.expandNode(parsenode)
            departAttr = DEPART_ATTRS.get(parsenode.localName)
            if departAttr is not None:
                startString = parsenode.getAttribute(departAttr)
                # HH:MM:SS style times are parsed; "triggered" sorts first.
                if ':' in startString:
                    start = sumolib.miscutils.parseTime(startString)
                elif startString == "triggered":
                    start = -1  # before everything else
                else:
                    start = float(startString)
                vehicles.append(
                    (start, parsenode.toprettyxml(indent="", newl="")))
            else:
                # copy to output
                outfile.write(" " * 4 + parsenode.toprettyxml(indent="", newl="") + "\n")
    print('read %s elements.' % len(vehicles))
    vehicles.sort(key=lambda v: v[0])
    for depart, vehiclexml in vehicles:
        outfile.write(" " * 4)
        outfile.write(vehiclexml)
        outfile.write("\n")
    outfile.write("</%s>\n" % root)
    print('wrote %s elements.' % len(vehicles))
def parseXML(stream, parser=None):
    """Parse XML from a string or a file-like stream into an XMLNode tree.

    Returns the document node when an END_DOCUMENT event was seen,
    otherwise the first node pushed on the chain.
    """
    if isinstance(stream, six.string_types):
        events = pulldom.parseString(stream, parser)
    else:
        events = pulldom.parse(stream, parser)
    document = None
    chain = []  # stack of currently-open XMLNode elements
    for event, node in events:
        if event == "START_DOCUMENT":
            chain.append(XMLNode("DOCUMENT", {}))
        elif event == "START_ELEMENT":
            node = XMLNode.fromDOMNode(node)
            if chain:
                chain[-1].children.append(node)
            chain.append(node)
        elif event == "END_ELEMENT":
            chain.pop(-1)
        elif event == "CHARACTERS":
            # Accumulate text into the innermost open element.
            chain[-1].data += node.data
        elif event == "END_DOCUMENT":
            document = chain.pop(-1)
    return document or chain[0]
def process_metadata(f):
    """Parse a GPX metadata dump and return a dict mapping each gpxFile's
    filename attribute to a dict of its metadata fields.
    """
    count = 0
    metadata = {}
    events = pulldom.parse(f)
    for event, node in events:
        if node.localName == 'gpxFile' and event == pulldom.START_ELEMENT:
            m = {}
            # String attributes are copied verbatim.
            for k in ['visibility', 'user']:
                if node.hasAttribute(k):
                    m[k] = node.getAttribute(k)
            # Numeric attributes are coerced to int.
            for k in ['id', 'uid', 'points']:
                if node.hasAttribute(k):
                    m[k] = int(node.getAttribute(k))
            if node.hasAttribute('timestamp'):
                # Keep only the YYYY-MM-DD date portion.
                m['date'] = node.getAttribute('timestamp')[0:10]
            events.expandNode(node)
            desc = node.getElementsByTagName('description')
            if desc and desc[0].firstChild:
                # Descriptions are truncated to 500 characters.
                m['description'] = desc[0].firstChild.data[0:500]
            tags = node.getElementsByTagName('tag')
            if tags:
                t = []
                for tag in tags:
                    if tag.firstChild:
                        t.append(tag.firstChild.data)
                m['tags'] = t
            metadata[node.getAttribute('filename')] = m
            count += 1
            # Progress dot every 10000 records.
            if count % 10000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
    return metadata
def handle_children(xmlfile, handle_parsenode):
    """Stream xmlfile, calling handle_parsenode on every fully-expanded
    depth-1 element; returns (root tag name, diff schema file or None,
    version attribute string, or None when no schema applies).
    """
    root = None
    schema = None
    version = ""
    level = 0
    xml_doc = pulldom.parse(xmlfile)
    for event, parsenode in xml_doc:
        if event == pulldom.START_ELEMENT:
            # print level, parsenode.getAttribute(ID_ATTR)
            if level == 0:
                # since we did not expand root_open contains the closing slash
                root = parsenode.localName
                if root == "edges":
                    schema = "edgediff_file.xsd"
                elif root == "tlLogics":
                    schema = "tllogic_file.xsd"
                if parsenode.hasAttribute("version"):
                    version = ' version="%s"' % parsenode.getAttribute(
                        "version")
                if root not in ("edges", "nodes", "connections", "tlLogics"):
                    # do not write schema information
                    version = None
            if level == 1:
                # consumes END_ELEMENT, no level increase
                xml_doc.expandNode(parsenode)
                handle_parsenode(parsenode)
            else:
                level += 1
        elif event == pulldom.END_ELEMENT:
            level -= 1
    return root, schema, version
def parse(self, source, base_uri):
    """parse(source) : Pattern

    Parses the XML from the input stream and returns a Pattern tree.
    Python 2 module (old-style raise statements).
    """
    self.base_uri = base_uri
    stream = util.DOMTokenStream(pulldom.parse(source))
    stream.set_legal_attributes(_legal_attributes)
    self.stream = stream
    # Process the document prologue
    assert stream.get_next_event()[0] == pulldom.START_DOCUMENT
    try:
        event, node = stream.get_next_event()
    except StopIteration:
        raise RuntimeError, ("No first element found -- "
                            "missing RELAX NG namespace declaration?")
    assert is_start_of_pattern(event, node)
    root_grammar = stream.root_grammar
    root_grammar.add_start_sym("", self.parse_rest_of_pattern(event, node))
    # Merge combined definitions and resolve the start symbol.
    root_grammar.combine()
    pattern = root_grammar.start_symbol
    if pattern is relaxng.NotAllowed:
        raise RuntimeError, "Schema reduces to NotAllowed (can never be valid)"
    return relaxng.Schema(pattern)
def handle_children(xmlfile, handle_parsenode):
    """Stream xmlfile, calling handle_parsenode on every fully-expanded
    depth-1 element; returns the reconstructed open/close tags of the root
    element (with the schema reference rewritten for edge diffs).
    """
    root_open = None
    root_close = None
    level = 0
    xml_doc = pulldom.parse(xmlfile)
    for event, parsenode in xml_doc:
        if event == pulldom.START_ELEMENT:
            # print level, parsenode.getAttribute(ID_ATTR)
            if level == 0:
                root_open = parsenode.toprettyxml(indent="")
                # since we did not expand root_open contains the closing slash
                root_open = root_open[:-3] + ">\n"
                # change the schema for edge diffs
                root_open = root_open.replace(
                    "edges_file.xsd", "edgediff_file.xsd")
                root_close = "</%s>\n" % parsenode.localName
            if level == 1:
                # consumes END_ELEMENT, no level increase
                xml_doc.expandNode(parsenode)
                handle_parsenode(parsenode)
            else:
                level += 1
        elif event == pulldom.END_ELEMENT:
            level -= 1
    return root_open, root_close
def __init__(self, stream_or_string, **options):
    """Set up a namespace-aware SAX parser feeding a pulldom event stream
    over self.stream (prepared by the base Deserializer).
    """
    super(Deserializer, self).__init__(stream_or_string, **options)
    parser = sax.make_parser()
    parser.setFeature(sax.handler.feature_namespaces, 1)
    # parser.setFeature(sax.handler.feature_namespace_prefixes, 1)
    self.event_stream = pulldom.parse(self.stream, parser)
    # Database alias to deserialize into; defaults to the project default.
    self.db = options.pop('using', DEFAULT_DB_ALIAS)
def sort_departs(routefilename, outfile):
    """Copy the route file `routefilename` to outfile with departing
    elements (those in DEPART_ATTRS) sorted by their depart attribute.

    Bug fix: the original parsed sys.argv[1] instead of the
    routefilename parameter, silently ignoring the argument.
    """
    routes_doc = pulldom.parse(routefilename)
    vehicles = []
    root = None
    for event, parsenode in routes_doc:
        if event == pulldom.START_ELEMENT:
            if root is None:
                # First element is the document root; echo its open tag.
                root = parsenode.localName
                outfile.write("<%s>\n" % root)
                continue
            routes_doc.expandNode(parsenode)
            departAttr = DEPART_ATTRS.get(parsenode.localName)
            if departAttr is not None:
                start = float(parsenode.getAttribute(departAttr))
                vehicles.append(
                    (start, parsenode.toprettyxml(indent="", newl="")))
            else:
                # copy to output
                outfile.write(
                    " " * 4 + parsenode.toprettyxml(indent="", newl="") + "\n")
    print('read %s elements.' % len(vehicles))
    vehicles.sort(key=lambda v: v[0])
    for depart, vehiclexml in vehicles:
        outfile.write(" " * 4)
        outfile.write(vehiclexml)
        outfile.write("\n")
    outfile.write("</%s>\n" % root)
    print('wrote %s elements.' % len(vehicles))
def filter_samples():
    """Filter the gzipped NCBI BioSample dump (INPUT_FILE) down to homo
    sapiens samples with enough relevant attributes and write them as a
    BioSampleSet document to OUTPUT_FILE.

    Fixes: removed the redundant f.close() that sat inside the `with`
    block (the context manager already closes the file) and switched the
    counters to augmented assignment.
    """
    if not os.path.exists(os.path.dirname(OUTPUT_FILE)):
        os.makedirs(os.path.dirname(OUTPUT_FILE))
    print('Input file: ' + INPUT_FILE)
    print('Processing NCBI samples...')
    # Read biosamples from XML file
    content = pulldom.parse(gzip.open(INPUT_FILE))
    processed_samples_count = 0
    selected_samples_count = 0
    with codecs.open(OUTPUT_FILE, 'w', 'utf-8') as f:
        f.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        f.write("<BioSampleSet>")
        for event, node in content:
            if event == 'START_ELEMENT' and node.tagName == 'BioSample':
                content.expandNode(node)
                node_xml = node.toxml()
                processed_samples_count += 1
                # Periodic progress report.
                if processed_samples_count % 5000 == 0:
                    print('Processed samples: ' + str(processed_samples_count))
                    print('Selected samples: ' + str(selected_samples_count))
                if is_homo_sapiens_sample(node_xml):
                    if has_minimum_relevant_attributes_count(
                            node_xml, constants.NCBI_FILTER_MIN_RELEVANT_ATTS):
                        f.write('\n' + node.toxml())
                        selected_samples_count += 1
        f.write("\n</BioSampleSet>\n")
    print('Finished processing NCBI samples')
    print('- Total samples processed: ' + str(processed_samples_count))
    print('- Total samples selected: ' + str(selected_samples_count))
def call(self):
    """
    Makes a request to cghub server.
    Returns generator that returns Result objects.

    Side effect: for XML responses, self.hits is set from the
    numFound attribute of the <result> element.
    """
    self.patch_input_data()
    query = self.build_query()
    url = '%s%s' % (self.server_url, self.uri)
    if query:
        url = '%s?%s' % (url, query)
    xml = self.get_source_file(url)
    if self.format == self.FORMAT_JSON:
        # Stream docs straight out of the JSON response.
        results = ijson.items(xml, 'response.docs.item')
        for item in results:
            yield item
    else:
        # http://docs.python.org/dev/library/xml.dom.pulldom.html
        doc = pulldom.parse(xml)
        for event, node in doc:
            if event == pulldom.START_ELEMENT:
                if node.tagName == 'doc':
                    doc.expandNode(node)
                    # convert to python object
                    # http://docs.python.org/2/library/xml.etree.elementtree.html
                    result_xml = node.toxml(encoding='utf-8')
                    tree = ElementTree.fromstring(result_xml)
                    result = Result(tree)
                    yield self.patch_result(result, result_xml)
                elif node.tagName == 'result':
                    # Total hit count reported by the server.
                    self.hits = int(node.getAttribute('numFound'))
def _parsePage(self, xml, pageResults, translationLanguage, includeSentences):
    """Parse one smart.fm API result page, adding lists / vocab items /
    sentences to pageResults; returns the number of entries added or
    changed.
    """
    events = pulldom.parse(xml)
    changedItemCount = 0
    for (event, node) in events:
        if event == pulldom.START_ELEMENT:
            if node.tagName.lower() == "list":
                events.expandNode(node)
                smartfmlist = SmartFMList()
                smartfmlist.loadFromDOM(node)
                # self._logMsg(smartfmlist)
                if pageResults.addList(smartfmlist):
                    changedItemCount += 1
            elif node.tagName.lower() == "item":
                events.expandNode(node)
                smartfmitem = SmartFMVocab()
                smartfmitem.loadFromDOM(node)
                # self._logMsg(smartfmitem)
                if pageResults.addItem(smartfmitem):
                    changedItemCount += 1
                if includeSentences:
                    # Example sentences embedded inside the item element.
                    for sentence in smartfmitem.sentencesFromDOM(node, translationLanguage):
                        # self._logMsg(sentence)
                        if pageResults.addItem(sentence):
                            changedItemCount += 1
                        elif sentence.uniqIdStr() in pageResults.items:
                            # Already present: link it to this vocab item.
                            pageResults.items[sentence.uniqIdStr()].linkToVocab(smartfmitem)
                            pageResults.updateIndexToMatch(sentence.uniqIdStr(), smartfmitem.uniqIdStr())
            elif includeSentences and node.tagName.lower() == "sentence":
                events.expandNode(node)
                smartfmsentence = SmartFMSentence()
                smartfmsentence.loadFromDOM(node, translationLanguage)
                if pageResults.addItem(smartfmsentence):
                    changedItemCount += 1
    return changedItemCount
def load_chunks(fpath, limit=None):
    """Yield Chunk objects parsed from the corpus file at fpath, stopping
    after `limit` chunks when given. Each Token carries its orth, the
    alternative lemmas/ctags and the disambiguated lemma/ctag.
    """
    with open(fpath, 'rb') as f:
        events = pulldom.parse(f, parser=_create_parser())
        chunk_id = 0
        chunk_events = _start_events(events, 'chunk')
        for chunk in tqdm(chunk_events, desc=f'Loading chunks from {fpath}'):
            # NOTE: the loop variable is deliberately reused for the nested
            # <chunk> elements found inside the outer chunk event.
            for chunk in _findall(chunk, 'chunk'):
                if chunk_id == limit:
                    return
                chunk_id += 1
                tokens = []
                for tok in _findall(chunk, 'tok'):
                    orth = _findvalue(tok, 'orth')
                    lemmas = []
                    ctags = []
                    disamb_lemma = None
                    disamb_ctag = None
                    for lex in _findall(tok, 'lex'):
                        lemma = _findvalue(lex, 'base')
                        ctag = _findvalue(lex, 'ctag')
                        # disamb="1" marks the chosen reading; all other
                        # readings are recorded as alternatives.
                        if lex.getAttribute('disamb') == '1':
                            disamb_lemma = lemma
                            disamb_ctag = ctag
                        else:
                            lemmas.append(lemma)
                            ctags.append(ctag)
                    token = Token(orth, lemmas, ctags, disamb_lemma, disamb_ctag)
                    tokens.append(token)
                yield Chunk(tokens)
def __init__(self, handle, namespace=None):
    """Create the object and initialize the XML parser.

    :param handle: open file handle or file name of a SeqXML document
    :param namespace: optional XML namespace, stored on the instance
    :raises ValueError: on an empty file or a malformed document start
    """
    # Header fields, filled in by _read_header().
    self.source = None
    self.source_version = None
    self.version = None
    self.speciesName = None
    self.ncbiTaxID = None
    self._namespace = namespace
    # pulldom.parse can accept both file handles and file names.
    # However, it doesn't use a context manager. so if we provide a file
    # name and let pulldom.parse open the file for us, then the file
    # will remain open until SeqXmlIterator is deallocated or we delete
    # the DOMEventStream returned by pulldom.parse.
    # Delete the DOMEventStream in case any exceptions happen.
    self._events = pulldom.parse(handle)
    try:
        try:
            event, node = next(self._events)
        except StopIteration:
            raise_from(ValueError("Empty file."), None)
        if event != "START_DOCUMENT" or node.localName is not None:
            raise ValueError("Failed to find start of XML")
        self._read_header()
    except Exception:
        # Drop the event stream so the underlying file can be released.
        self._events = None
        raise
def parse(xmlfile, element_name):
    # parses the attributes of all nodes with element_name and returns a list of namedtuples
    # @note the first node in xmlfile will determine the set of attributes
    # @note attribute names which are also python keywords will be prefixed with 'attr_'
    elementType = []  # mutable, will be [namedtuple]
    results = []
    for event, node in pulldom.parse(xmlfile):
        if event == pulldom.START_ELEMENT and node.localName == element_name:
            results.append(get_attrs(node, elementType, element_name))
    return results
def _initFromFile(self, path):
    """Populate this object from the kanji dictionary XML file at `path`,
    expanding every <character> element and feeding it to
    self._processNode.

    Fix: the file handle was opened and never closed; it is now managed
    with a context manager.
    """
    with open(path) as kanjiDicFile:
        events = pulldom.parse(kanjiDicFile)
        for (event, node) in events:
            if event == pulldom.START_ELEMENT:
                if node.tagName.lower() == 'character':
                    events.expandNode(node)
                    self._processNode(node)
def __init__(self, stream):
    """Wrap `stream` in an XMLStream and start pulldom parsing with a
    small buffer so events become available promptly.
    """
    self._items = pulldom.parse(XMLStream(stream), bufsize=256)
    self._item = None  # The current item
    self._next = None  # 1 item pushback buffer
    # Fields describing the current event/node, filled during iteration.
    self.kind = None
    self.name = None
    self.attrs = None
    self.value = None
def process_map(filename):
    """Tally key categories for every start element in the map file."""
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for event, node in pulldom.parse(filename):
        if event == pulldom.START_ELEMENT:
            keys = key_type(node, keys)
    return keys
def testXInclude(self):
    """XInclude must not be resolved: the child stays an xi:include node."""
    wanted = "data"
    doc = _PULLDOM.parse("../../xml_files_windows/xinclude.xml")
    for event, node in doc:
        if event == _PULLDOM.START_ELEMENT and node.tagName == wanted:
            doc.expandNode(node)
            self.assertEqual("xi:include", node.firstChild.nodeName)
def dmoz_reader(filename):
    """Yield (url, topics) pairs for every ExternalPage in a DMOZ dump."""
    events = pulldom.parse(filename)
    for event, node in events:
        if event != pulldom.START_ELEMENT or node.tagName != 'ExternalPage':
            continue
        events.expandNode(node)
        page_url = node.attributes['about'].value
        first_topic = node.getElementsByTagName('topic')[0]
        yield page_url, first_topic.childNodes[0].data
def process_map(filename):
    """Count key categories across all start elements of the file."""
    counts = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    events = pulldom.parse(filename)
    for event, element in events:
        if event == pulldom.START_ELEMENT:
            counts = key_type(element, counts)
    return counts
def get_wk_nodes():
    """Yield fully-expanded <page> DOM nodes read from stdin.

    Any exception silently ends the generator; the error text is written
    to stderr rather than propagated.
    """
    events = pulldom.parse(sys.stdin)
    try:
        for (event, node) in events:
            if event == pulldom.START_ELEMENT and node.tagName == "page":
                events.expandNode(node)
                yield node
    except Exception as e:
        sys.stderr.write(str(e) + "\n")
def testXSLT(self):
    """A stylesheet root element should parse and expand normally."""
    wanted = "xsl:stylesheet"
    doc = _PULLDOM.parse("../../xml_files_windows/optional/xslt.xsl")
    for event, node in doc:
        if event == _PULLDOM.START_ELEMENT and node.tagName == wanted:
            doc.expandNode(node)
            self.assertEqual("xsl:stylesheet", node.nodeName)
def WalkNodesForAttributes(path):
    """Parse the xml file getting all attributes.

    <venue>
      <attribute>value</attribute>
    </venue>

    Returns:
      type_name - The java-style name the top node will have. "Venue"
      top_node_name - unadultured name of the xml stanza, probably the type
        of java class we're creating. "venue"
      attributes - {'attribute': 'value'}
    """
    doc = pulldom.parse(path)
    type_name = None
    top_node_name = None
    attributes = {}
    level = 0  # > 0 means we are currently skipping a complex subtree
    for event, node in doc:
        # For skipping parts of a tree.
        if level > 0:
            if event == pulldom.END_ELEMENT:
                level -= 1
                logging.warn('(%s) Skip end: %s' % (str(level), node))
                continue
            elif event == pulldom.START_ELEMENT:
                logging.warn('(%s) Skipping: %s' % (str(level), node))
                level += 1
                continue
        if event == pulldom.START_ELEMENT:
            logging.warn('Parsing: ' + node.tagName)
            # Get the type name to use.
            if type_name is None:
                # First element seen becomes the top node; CamelCase it.
                type_name = ''.join([word.capitalize() for word in node.tagName.split('_')])
                top_node_name = node.tagName
                logging.warn('Found Top Node Name: ' + top_node_name)
                continue
            typ = node.getAttribute('type')
            child = node.getAttribute('child')
            # We don't want to walk complex types.
            if typ in COMPLEX:
                logging.warn('Found Complex: ' + node.tagName)
                level = 1
            elif typ not in TYPES:
                # Unknown types fall back to string.
                logging.warn('Found String: ' + typ)
                typ = STRING
            else:
                logging.warn('Found Type: ' + typ)
            logging.warn('Adding: ' + str((node, typ)))
            attributes.setdefault(node.tagName, (typ, [child]))
    logging.warn('Attr: ' + str((type_name, top_node_name, attributes)))
    return type_name, top_node_name, attributes
def WalkNodesForAttributes(path):
    """Parse the xml file getting all attributes.

    <venue>
      <attribute>value</attribute>
    </venue>

    Returns:
      type_name - The java-style name the top node will have. "Venue"
      top_node_name - unadultured name of the xml stanza, probably the type
        of java class we're creating. "venue"
      attributes - {'attribute': 'value'}
    """
    doc = pulldom.parse(path)
    type_name = None
    top_node_name = None
    attributes = {}
    level = 0  # > 0 means we are currently skipping a complex subtree
    for event, node in doc:
        # For skipping parts of a tree.
        if level > 0:
            if event == pulldom.END_ELEMENT:
                level -= 1
                logging.warn('(%s) Skip end: %s' % (str(level), node))
                continue
            elif event == pulldom.START_ELEMENT:
                logging.warn('(%s) Skipping: %s' % (str(level), node))
                level += 1
                continue
        if event == pulldom.START_ELEMENT:
            logging.warn('Parsing: ' + node.tagName)
            # Get the type name to use.
            if type_name is None:
                # First element seen becomes the top node; CamelCase it.
                type_name = ''.join(
                    [word.capitalize() for word in node.tagName.split('_')])
                top_node_name = node.tagName
                logging.warn('Found Top Node Name: ' + top_node_name)
                continue
            typ = node.getAttribute('type')
            child = node.getAttribute('child')
            # We don't want to walk complex types.
            if typ in COMPLEX:
                logging.warn('Found Complex: ' + node.tagName)
                level = 1
            elif typ not in TYPES:
                # Unknown types fall back to string.
                logging.warn('Found String: ' + typ)
                typ = STRING
            else:
                logging.warn('Found Type: ' + typ)
            logging.warn('Adding: ' + str((node, typ)))
            attributes.setdefault(node.tagName, (typ, [child]))
    logging.warn('Attr: ' + str((type_name, top_node_name, attributes)))
    return type_name, top_node_name, attributes
def testTextNodes(self):
    """Collect all CHARACTERS events and compare against the expected
    greek sample text.

    NOTE(review): the expected repr carries a u'' prefix, which only
    matches Python 2 unicode reprs; on Python 3 this assertion would
    fail -- confirm the intended interpreter.
    """
    text = []
    for event, node in pulldom.parse(self.testFile):
        if event == pulldom.CHARACTERS:
            text.append(node.data)
    try:
        result = "".join(text)
        self.assertEqual(repr(result), r"u'\n Some greek: \u0391\u0392\u0393\u0394\u0395\n \n \n \n'")
    except Exception as x:
        self.fail("Unexpected exception joining text pieces: %s" % str(x))
def testComment(self):
    """Concatenate all COMMENT event data and compare with the expected
    greek sample.

    NOTE(review): the expected repr carries a u'' prefix, which only
    matches Python 2 unicode reprs -- confirm the intended interpreter.
    """
    commentText = []
    for event, node in pulldom.parse(self.testFile):
        if event == pulldom.COMMENT:
            commentText.append(node.data)
    try:
        result = "".join(commentText)
        self.assertEqual(repr(result), r"u'ΛΜΝΞΟ'")
    except Exception as x:
        self.fail("Unexpected exception joining comment data pieces: %s" % str(x))
def parse_stats(metrics=METRICS):
    """Yield a web.storage of attributes for every <representative>
    element found in the per-metric stats XML files.

    Files that cannot be opened are silently skipped.
    """
    for metric in metrics:
        for fn in glob.glob(STATS_XML % metric):
            try:
                dom = pulldom.parse(fn)
            except IOError:
                continue
            for event, node in dom:
                if event == "START_ELEMENT" and node.tagName == 'representative':
                    yield web.storage(node.attributes.items())
def parse_fec():
    """Yield {'fecid', 'bioguideid'} dicts from the FEC candidate XML.

    Candidates whose <uri> still contains the FEC id (i.e. no bioguide
    id assigned) are skipped.
    """
    dom = pulldom.parse(FEC_XML)
    for event, node in dom:
        if event == "START_ELEMENT" and node.tagName == 'candidate':
            dom.expandNode(node)
            fec_id = node.getElementsByTagName('id')[0].firstChild.nodeValue
            uri = node.getElementsByTagName('uri')[0].firstChild.nodeValue
            if fec_id in uri:
                continue
            # The bioguide id is the last path segment of the uri.
            bioguide_id = uri.split('/')[-1]
            yield {'fecid': fec_id, 'bioguideid': bioguide_id}
def testXXE(self):
    """External entity content should appear inside the <data> element."""
    wanted = "data"
    doc = _PULLDOM.parse("../../xml_files_windows/xxe/xxe.xml")
    for event, node in doc:
        if event == _PULLDOM.START_ELEMENT and node.tagName == wanted:
            doc.expandNode(node)
            self.assertEqual("data", node.nodeName)
            self.assertEqual("it_works", node.firstChild.data)
def testComment(self):
    """Concatenate all COMMENT event data and compare with the expected
    greek sample. Python 2 test (u"" literal, old except syntax).
    """
    commentText = []
    for event, node in pulldom.parse(self.testFile):
        if event == pulldom.COMMENT:
            commentText.append(node.data)
    try:
        result = u"".join(commentText)
        self.failUnlessEqual(repr(result), r"u'ΛΜΝΞΟ'")
    except Exception, x:
        self.fail("Unexpected exception joining comment data pieces: %s" % str(x))
def testTextNodes(self):
    """Collect all CHARACTERS events and compare against the expected
    greek sample text. Python 2 test (u"" literal, old except syntax).
    """
    text = []
    for event, node in pulldom.parse(self.testFile):
        if event == pulldom.CHARACTERS:
            text.append(node.data)
    try:
        result = u"".join(text)
        self.failUnlessEqual(repr(result), r"u'\n Some greek: \u0391\u0392\u0393\u0394\u0395\n \n \n \n'")
    except Exception, x:
        self.fail("Unexpected exception joining text pieces: %s" % str(x))
def testDefault_noAttack(self):
    """Baseline: a plain XML file parses and yields the expected value."""
    wanted = "data"
    doc = _PULLDOM.parse("../../xml_files_windows/standard.xml")
    for event, node in doc:
        if event == _PULLDOM.START_ELEMENT and node.tagName == wanted:
            doc.expandNode(node)
            self.assertEqual("data", node.nodeName)
            self.assertEqual("4", node.firstChild.data)
def processAntFile(filename):
    """Parse an Ant build file, feeding every START_ELEMENT node through
    printNode and processNode. Updates the module-level currentFile.

    Fix: the file handle was opened and never closed; it is now managed
    with a context manager.
    """
    global currentFile
    with open(filename) as handle:
        doc = pulldom.parse(handle)
        resetCurrent()
        currentFile = filename
        for event, node in doc:
            if event == pulldom.START_ELEMENT:
                printNode(node)
                processNode(node)
def testParameterEntity_core(self):
    """Parameter entities must not expand into element content."""
    wanted = "data"
    doc = _PULLDOM.parse("../../xml_files_windows/xxep/parameterEntity_core.xml")
    for event, node in doc:
        if event == _PULLDOM.START_ELEMENT and node.tagName == wanted:
            doc.expandNode(node)
            self.assertEqual("<data/>", node.toxml())
def __init__(self, stream_or_string, *, using=DEFAULT_DB_ALIAS, ignorenonexistent=False, **options):
    """Set up the XML deserializer over a pulldom event stream.

    :param using: database alias to deserialize into
    :param ignorenonexistent: when true, ignore data for fields/models
        that no longer exist
    """
    super().__init__(stream_or_string, **options)
    self.event_stream = pulldom.parse(self.stream, self._make_parser())
    self.db = using
    self.ignore = ignorenonexistent
def testDOS_recursion(self):
    """A recursive-entity (billion laughs style) document must abort
    parsing with a SAXParseException instead of expanding.
    """
    file = "../../xml_files_windows/dos/dos_recursion.xml"
    tagName = "data"
    with self.assertRaises(_SAX.SAXParseException):
        doc = _PULLDOM.parse(file)
        for event, node in doc:
            if event == _PULLDOM.START_ELEMENT and node.tagName == tagName:
                doc.expandNode(node)
                tmp = node.toxml()
def read(self):
    """Yield Entry objects, with sequential ids starting at 1, for every
    <entry> element in self.filename.
    """
    entry_id = 1
    events = pulldom.parse(self.filename)
    # events = pulldom.parse(sys.argv[1])
    for (event, node) in events:
        if event == "START_ELEMENT" and node.tagName == "entry":
            # Expand so Entry sees the whole subtree.
            events.expandNode(node)
            entry = Entry(node, entry_id)
            yield entry
            entry_id += 1
def process_gpx(db, gpx_id, f, options):
    """Read trackpoints from the GPX stream f and insert simplified track
    segments into gpx_data for gpx_id.

    Points closer than options.dmin (Manhattan distance) to the previous
    point are dropped; a jump beyond options.dmax or a closing </trkseg>
    flushes the current segment. options.pmax bounds points per segment;
    segments shorter than max(2, options.pmin) are discarded.
    """
    cur = db.cursor()
    geomfromtext = 'ST_GeomFromText(%s)'
    if options.reproject:
        # Reproject into web mercator (900913) on insert.
        geomfromtext = 'ST_Transform({0}, 900913)'.format(geomfromtext)
    segment = 0
    needWrite = False
    events = pulldom.parse(f)
    for event, node in events:
        if event == pulldom.START_ELEMENT:
            if node.localName == 'trkseg':
                # Start a fresh segment.
                points = []
                polledPoints = []
                needWrite = False
                lastNode = None
                lastDate = None
            elif node.localName == 'trkpt':
                lat = float(node.getAttribute('lat'))
                lon = float(node.getAttribute('lon'))
                # Manhattan distance to the previous point; the first point
                # of a segment is forced beyond dmin so it is kept.
                dist = abs(lon - lastNode[0]) + abs(
                    lat - lastNode[1]) if lastNode else options.dmin * 2
                events.expandNode(node)
                t = node.getElementsByTagName('time')
                time = t[0].firstChild.data
                lastNode = (lon, lat, time)
                if dist > options.dmax:
                    # Large jump: flush, restart the segment from here.
                    needWrite = True
                    polledPoints = [(lon, lat, time)]
                elif dist >= options.dmin:
                    points.append((lon, lat, time))
                    if len(points) >= options.pmax:
                        needWrite = True
                t = node.getElementsByTagName('time')
                if t and t[0].firstChild and len(
                        t[0].firstChild.data) >= 10:
                    # Remember the YYYY-MM-DD date of the latest point.
                    lastDate = t[0].firstChild.data[0:10]
        elif event == pulldom.END_ELEMENT and node.localName == 'trkseg':
            needWrite = True
        if needWrite:
            if points and len(points) >= max(2, options.pmin):
                # LINESTRINGM: x y m triples, m carrying the timestamp.
                geom = 'SRID=4326;LINESTRINGM(' + ','.join(
                    ['{0} {1} {2}'.format(x[0], x[1], x[2]) for x in points]) + ')'
                cur.execute(
                    'insert into gpx_data (gpx_id, segment_id, track_date, track) values (%s, %s, %s, {0})'
                    .format(geomfromtext), (gpx_id, segment, lastDate, geom))
                segment += 1
            points = polledPoints
            polledPoints = []
            needWrite = False
    cur.close()
def __init__(self, stream_or_string, **options):
    """Wrap a string or stream in a pulldom event stream.

    :param stream_or_string: XML source; strings are wrapped in StringIO
    :param options: remaining deserializer options; 'thin' is popped
        (default 0) -- presumably controls thin deserialization, confirm
        against callers.
    """
    self.rawImportRecordBuffer = []
    self.options = options
    if isinstance(stream_or_string, six.string_types):
        self.stream = six.StringIO(stream_or_string)
    else:
        self.stream = stream_or_string
    self.event_stream = pulldom.parse(self.stream, self._make_parser())
    self.thin = options.pop('thin', 0)