Beispiel #1
0
 def endElement(self, name):
     SAX2DOM.endElement(self, name)
     if name == 'release':
         release = self.document.getElementsByTagName('release')[0]
         self.restart()
         #print release.toprettyxml('    ').encode('utf8')
         id = int(release.getAttribute('id'))
         sys.stderr.write('%d\r' % id)
         if id not in self._discogs_ids:
             return
         catnos = []
         labels = []
         for node in release.getElementsByTagName('label'):
             catnos.append(node.getAttribute('catno'))
             labels.append(node.getAttribute('name'))
         nodes = release.getElementsByTagName('country')
         country = get_text(nodes[0]) if nodes else ''
         nodes = release.getElementsByTagName('released')
         date = get_text(nodes[0]) if nodes else ''
         formats = []
         for node in release.getElementsByTagName('format'):
             formats.append(node.getAttribute('name'))
         line = '%s\t%s\t%s\t%s\t%s\t%s' % (id, ';'.join(catnos),
                                            ';'.join(labels), country, date,
                                            ';'.join(set(formats)))
         print line.encode('utf-8')
Beispiel #2
0
def fuck_dom(page):
    """Turn an HTML *page* into a DOM document.

    The page is first run through ``UnicodeDammit`` and its
    ``unicode_markup`` is parsed with ``etree.HTMLParser``.  The resulting
    tree is then replayed as SAX events into a ``SAX2DOM`` handler, whose
    ``document`` is returned.
    """
    markup = UnicodeDammit(page).unicode_markup
    html_tree = etree.fromstring(markup, etree.HTMLParser())

    dom_builder = SAX2DOM()
    sax.saxify(html_tree, dom_builder)
    return dom_builder.document
Beispiel #3
0
def parse_lxml_dom(xml, strict_xml=True):
    """Parse *xml* with lxml and return it as a DOM document.

    With ``strict_xml`` true the markup is parsed by
    ``lxml.etree.fromstring``; otherwise ``lxml.html.document_fromstring``
    is used.  If the chosen parser raises ``lxml.etree.XMLSyntaxError`` the
    markup is wrapped in a ``<body>`` element and parsed once more with the
    same parser.  The tree is then replayed into a ``SAX2DOM`` handler and
    the handler's ``document`` is returned.
    """
    parse_func = (lxml.etree.fromstring if strict_xml
                  else lxml.html.document_fromstring)

    try:
        tree = parse_func(xml)
    except lxml.etree.XMLSyntaxError:
        # Second attempt: same parser, input wrapped in <body>...</body>.
        wrapped = '<body>%s</body>' % xml
        tree = parse_func(wrapped)

    dom_builder = SAX2DOM()
    lxml.sax.saxify(tree, dom_builder)
    return dom_builder.document
 def endElement(self, name):
     SAX2DOM.endElement(self, name)
     if name == 'release':
         release = self.document.getElementsByTagName('release')[0]
         self.restart()
         #print release.toprettyxml('    ').encode('utf8')
         id = int(release.getAttribute('id'))
         sys.stderr.write('%d\r' % id)
         if id not in self._discogs_ids:
             return
         catnos = []
         labels = []
         for node in release.getElementsByTagName('label'):
             catnos.append(node.getAttribute('catno'))
             labels.append(node.getAttribute('name'))
         nodes = release.getElementsByTagName('country')
         country = get_text(nodes[0]) if nodes else ''
         nodes = release.getElementsByTagName('released')
         date = get_text(nodes[0]) if nodes else ''
         formats = []
         for node in release.getElementsByTagName('format'):
             formats.append(node.getAttribute('name'))
         line = '%s\t%s\t%s\t%s\t%s\t%s' % (id, ';'.join(catnos), ';'.join(labels), country, date, ';'.join(set(formats)))
         print line.encode('utf-8')
    def loadFromFile(cls, filename):
        """Read the XML file *filename* and build a model with ``cls.parse``.

        The file is parsed by lxml, replayed as SAX events into a
        ``SAX2DOM`` handler, and the resulting DOM document is handed to
        ``cls.parse``; whatever that returns is returned unchanged.

        Raises:
            Exception: if *filename* does not exist or is not a regular
                file.  The same message is printed before raising.
        """
        abs_path = os.path.abspath(filename)
        if not os.path.isfile(abs_path):
            # The message reports the name as given, not the absolute path.
            message = "Error: '%s' does not exist or is not a file." % filename
            print(message)
            raise Exception(message)

        # NOTE: parsing the file directly with dexml/minidom is supposedly
        # slower, so lxml is used here -- this was never benchmarked.
        parsed = etree.parse(abs_path)
        dom_builder = SAX2DOM()
        sax.saxify(parsed, dom_builder)

        # If needed, the filename could be passed straight to parse() here
        # to skip lxml altogether.
        return cls.parse(dom_builder.document)
Beispiel #6
0
def parse_lxml_dom(tree):
    """Convert an lxml *tree* into a DOM document.

    The tree is replayed as SAX events into a fresh ``SAX2DOM`` handler via
    ``lxml.sax.saxify``; the handler's ``document`` is returned.
    """
    dom_builder = SAX2DOM()
    lxml.sax.saxify(tree, dom_builder)
    return dom_builder.document
Beispiel #7
0
 def endElement(self, name):
     """Close element *name*.

     Removes the top entry from ``self._locationStack`` and then forwards
     the event to ``SAX2DOM.endElement``.
     """
     # Drop the most recent location record before the base class runs.
     self._locationStack.pop()
     SAX2DOM.endElement(self, name)
Beispiel #8
0
 def startElement(self, name, attrs):
     """Open element *name*, recording where it was seen.

     A ``(name, line, column)`` tuple, with the position read from
     ``self._docLocator``, is pushed onto ``self._locationStack``; the event
     is then forwarded to ``SAX2DOM.startElement``.
     """
     locator = self._docLocator
     position = (name, locator.getLineNumber(), locator.getColumnNumber())
     self._locationStack.append(position)
     SAX2DOM.startElement(self, name, attrs)
Beispiel #9
0
 def setDocumentLocator(self, locator):
     """Store *locator* as ``self._docLocator`` and pass it on to SAX2DOM."""
     self._docLocator = locator
     SAX2DOM.setDocumentLocator(self, locator)
Beispiel #10
0
 def __init__(self):
     """Initialise the SAX2DOM base and start with an empty location stack."""
     SAX2DOM.__init__(self)
     # Starts out empty; a plain list used as a stack.
     self._locationStack = []
Beispiel #11
0
 def endElement(self, name):
     """Close element *name*.

     Removes the top entry from ``self._locationStack`` and then forwards
     the event to ``SAX2DOM.endElement``.
     """
     # Drop the most recent location record before the base class runs.
     self._locationStack.pop()
     SAX2DOM.endElement(self, name)
Beispiel #12
0
 def startElement(self, name, attrs):
     """Open element *name*, recording where it was seen.

     A ``(name, line, column)`` tuple, with the position read from
     ``self._docLocator``, is pushed onto ``self._locationStack``; the event
     is then forwarded to ``SAX2DOM.startElement``.
     """
     locator = self._docLocator
     position = (name, locator.getLineNumber(), locator.getColumnNumber())
     self._locationStack.append(position)
     SAX2DOM.startElement(self, name, attrs)
Beispiel #13
0
 def setDocumentLocator(self, locator):
     """Store *locator* as ``self._docLocator`` and pass it on to SAX2DOM."""
     self._docLocator = locator
     SAX2DOM.setDocumentLocator(self, locator)
Beispiel #14
0
 def __init__(self):
     """Initialise the SAX2DOM base and start with an empty location stack."""
     SAX2DOM.__init__(self)
     # Starts out empty; a plain list used as a stack.
     self._locationStack = []
Beispiel #15
0
 def __init__(self, discogs_ids):
     """Initialise the SAX2DOM base and keep *discogs_ids* on the handler.

     NOTE(review): presumably a collection of integer release ids used for
     membership tests elsewhere in the class -- confirm against the caller.
     """
     SAX2DOM.__init__(self)
     self._discogs_ids = discogs_ids
Beispiel #16
0
 def __init__(self, discogs_ids):
     """Initialise the SAX2DOM base and keep *discogs_ids* on the handler.

     NOTE(review): presumably a collection of integer release ids used for
     membership tests elsewhere in the class -- confirm against the caller.
     """
     SAX2DOM.__init__(self)
     self._discogs_ids = discogs_ids