def endElement(self, name):
    """Finish an element; when a ``release`` closes, emit one summary row.

    Every element is first handed to ``SAX2DOM.endElement``.  When the closing
    tag is ``release``, the first ``release`` node of ``self.document`` is
    taken, ``self.restart()`` is called, and the release id is written to
    stderr followed by a carriage return.  Releases whose id is not in
    ``self._discogs_ids`` produce no output.  For the others a single
    tab-separated, UTF-8 encoded line is written to stdout with the columns:
    id, catalogue numbers, label names, country, release date, formats.
    Multi-valued columns are joined with ``;``.

    :param name: tag name of the element that just ended.
    :raises ValueError: if the ``id`` attribute is not an integer.
    """
    SAX2DOM.endElement(self, name)
    if name != 'release':
        return

    release = self.document.getElementsByTagName('release')[0]
    # NOTE(review): restart() is defined outside this block; presumably it
    # resets the DOM being built so finished releases do not pile up -- confirm.
    self.restart()

    release_id = int(release.getAttribute('id'))
    # The trailing '\r' keeps the progress counter on a single terminal line.
    sys.stderr.write('%d\r' % release_id)
    if release_id not in self._discogs_ids:
        return

    label_nodes = release.getElementsByTagName('label')
    catnos = [node.getAttribute('catno') for node in label_nodes]
    labels = [node.getAttribute('name') for node in label_nodes]

    country_nodes = release.getElementsByTagName('country')
    country = get_text(country_nodes[0]) if country_nodes else ''
    released_nodes = release.getElementsByTagName('released')
    date = get_text(released_nodes[0]) if released_nodes else ''

    # De-duplicate in document order: joining a set() made the column order
    # arbitrary, so identical input could yield different output lines.
    formats = []
    for node in release.getElementsByTagName('format'):
        format_name = node.getAttribute('name')
        if format_name not in formats:
            formats.append(format_name)

    line = '%s\t%s\t%s\t%s\t%s\t%s' % (
        release_id,
        ';'.join(catnos),
        ';'.join(labels),
        country,
        date,
        ';'.join(formats),
    )

    # Emit raw UTF-8 bytes.  Text streams that expose a binary ``buffer``
    # (Python 3) get the bytes there; otherwise (Python 2) stdout accepts the
    # byte string directly, matching the former ``print`` statement.
    stream = getattr(sys.stdout, 'buffer', sys.stdout)
    stream.write(line.encode('utf-8') + b'\n')
def fuck_dom(page):
    """Turn a page of markup into a DOM document.

    The page is converted to unicode with ``UnicodeDammit``, parsed with
    ``etree.fromstring`` using an ``etree.HTMLParser``, and the resulting tree
    is replayed as SAX events into a ``SAX2DOM`` handler.

    :param page: the markup to parse, as accepted by ``UnicodeDammit``.
    :returns: the ``document`` attribute of the ``SAX2DOM`` handler.
    """
    markup = UnicodeDammit(page).unicode_markup
    html_parser = etree.HTMLParser()
    root = etree.fromstring(markup, html_parser)

    dom_builder = SAX2DOM()
    sax.saxify(root, dom_builder)
    return dom_builder.document
def parse_lxml_dom(xml, strict_xml=True):
    """Parse *xml* with lxml and return the equivalent DOM document.

    :param xml: the markup to parse.
    :param strict_xml: when true, parse with ``lxml.etree.fromstring``;
        otherwise use ``lxml.html.document_fromstring``.
    :returns: the ``document`` built by a ``SAX2DOM`` handler fed from the
        parsed tree.

    If the first attempt raises ``lxml.etree.XMLSyntaxError`` the input is
    wrapped in ``<body>...</body>`` and parsed once more with the same
    function; an error from that second attempt propagates to the caller.
    """
    parse_func = (
        lxml.etree.fromstring if strict_xml else lxml.html.document_fromstring
    )

    try:
        tree = parse_func(xml)
    except lxml.etree.XMLSyntaxError:
        # Retry once with the input wrapped in a <body> element.
        wrapped = '<body>%s</body>' % xml
        tree = parse_func(wrapped)

    dom_builder = SAX2DOM()
    lxml.sax.saxify(tree, dom_builder)
    return dom_builder.document
def loadFromFile(cls, filename):
    """Parse the XML file *filename* and build a model via ``cls.parse``.

    The file is parsed with ``etree.parse``, the tree is replayed as SAX
    events into a ``SAX2DOM`` handler, and the resulting DOM document is
    passed to ``cls.parse``.

    :param filename: path of the file to load; it is made absolute before use.
    :returns: whatever ``cls.parse`` returns for the DOM document.
    :raises IOError: if *filename* does not exist or is not a regular file.
        The same message is printed before the exception is raised.
    """
    path_file = os.path.abspath(filename)
    if not os.path.isfile(path_file):
        err = "Error: '%s' does not exist or is not a file." % filename
        print(err)
        # IOError instead of a bare Exception: callers can target the failure
        # precisely, while existing ``except Exception`` handlers still match.
        raise IOError(err)

    # Note: parsing a file directly with dexml/minidom is supposedly slower,
    # so lxml is used here, but this was not benchmarked.
    tree = etree.parse(path_file)
    handler = SAX2DOM()
    sax.saxify(tree, handler)
    dom = handler.document

    # If needed, the filename can be passed to parse() here to skip lxml.
    return cls.parse(dom)
def parse_lxml_dom(tree):
    """Replay *tree* as SAX events and return the DOM document built from them.

    :param tree: the tree handed to ``lxml.sax.saxify``.
    :returns: the ``document`` attribute of the ``SAX2DOM`` handler.
    """
    dom_builder = SAX2DOM()
    lxml.sax.saxify(tree, dom_builder)
    return dom_builder.document
def endElement(self, name):
    """Drop the newest location entry, then let ``SAX2DOM`` close the element.

    :param name: tag name of the element that just ended.
    :raises IndexError: if ``self._locationStack`` is already empty.
    """
    # Discard the top of the location stack (presumably the entry recorded
    # when this element was opened -- confirm against the start handler).
    open_locations = self._locationStack
    open_locations.pop()

    SAX2DOM.endElement(self, name)
def startElement(self, name, attrs):
    """Record where the element starts, then let ``SAX2DOM`` open it.

    A ``(name, line, column)`` tuple, read from ``self._docLocator``, is
    appended to ``self._locationStack`` before delegating to
    ``SAX2DOM.startElement``.

    :param name: tag name of the element being opened.
    :param attrs: the element's attributes, forwarded unchanged.
    """
    locator = self._docLocator
    line = locator.getLineNumber()
    column = locator.getColumnNumber()
    self._locationStack.append((name, line, column))

    SAX2DOM.startElement(self, name, attrs)
def setDocumentLocator(self, locator):
    """Keep a reference to *locator* and forward it to ``SAX2DOM``.

    :param locator: the locator object supplied by the parser; stored as
        ``self._docLocator``.
    """
    # Stored under a separate attribute in addition to whatever the base
    # class keeps for itself.
    self._docLocator = locator

    SAX2DOM.setDocumentLocator(self, locator)
def __init__(self):
    """Initialise the ``SAX2DOM`` base and start with an empty location stack."""
    SAX2DOM.__init__(self)

    # Starts empty; a list is used so entries can be appended and popped.
    self._locationStack = list()
def __init__(self, discogs_ids):
    """Initialise the ``SAX2DOM`` base and remember the ids of interest.

    :param discogs_ids: stored unchanged as ``self._discogs_ids``.
    """
    SAX2DOM.__init__(self)

    # Kept by reference, not copied.
    self._discogs_ids = discogs_ids