class HtmlDom:
    """DOM and XPath context built from an HTML document.

    Attributes set by the constructor:
        dom      -- document returned by ``PyExpat.Reader().fromString``.
        nss      -- prefix map ``{u'html': XHTML_NAMESPACE}`` for XPath.
        context  -- ``xml.xpath.Context.Context`` over ``dom`` using ``nss``.
    """

    def __init__(self, url):
        """Load the document named by *url* and build its DOM.

        *url* is first opened as a local file.  If that raises IOError,
        it is retrieved with ``fetch(url, agent=MOZILLA_AGENT)`` and the
        ``'data'`` entry of the result is used instead.

        Raises:
            IOError: with message 'invalid URL' when the local read failed
                and the fetch fallback failed as well.
        """
        try:
            f = open(url)
            try:
                data = f.read()
            finally:
                # Close even when read() fails, so the handle is not
                # leaked while we fall through to the fetch path.
                f.close()
        except IOError:
            try:
                result = fetch(url, agent=MOZILLA_AGENT)
                data = result['data']
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt / SystemExit
                # must propagate instead of being reported as a bad URL.
                raise IOError('invalid URL')

        # Run the raw markup through the tidy tree builder and serialize
        # the resulting tree back to text.
        parser = tidy.TreeBuilder()
        parser.feed(data)
        xmlText = _etree.tostring(parser.close())

        # Create the DOM from the serialized text.
        reader = PyExpat.Reader()
        self.dom = reader.fromString(xmlText)

        # XPath expressions evaluated with this context can use the
        # 'html' prefix for XHTML_NAMESPACE.
        self.nss = {u'html': XHTML_NAMESPACE}
        self.context = xml.xpath.Context.Context(self.dom,
                                                 processorNss=self.nss)