def parseFile(filename, html=True):
    """Parse the file at ``filename`` with libxml2dom and return the document.

    Args:
        filename: Path of the file to open and parse.
        html: Accepted for interface compatibility only; it is not
            forwarded to the parser.

    Returns:
        The document object produced by ``libxml2dom.parse``.
    """
    flush_print("Parsing: %s" % filename)
    fileHandle = open(filename)
    try:
        # Close the handle even when parsing raises, so a malformed file
        # does not leak an open descriptor.
        return libxml2dom.parse(fileHandle)
    finally:
        fileHandle.close()
def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, impl=None):
    """Parse a document through ``libxml2dom.parse`` and return the result.

    Every argument is forwarded unchanged, except that a false ``impl`` is
    replaced by the ``default_impl`` name in scope.
    """
    implementation = impl or default_impl
    return libxml2dom.parse(
        stream_or_string,
        html=html,
        htmlencoding=htmlencoding,
        unfinished=unfinished,
        impl=implementation
    )
def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, impl=None):
    """Parse a document, pass it to ``initialiseEvents`` and return it.

    The arguments are forwarded unchanged to ``libxml2dom.parse``, except
    that a false ``impl`` is replaced by the ``default_impl`` name in scope.
    """
    chosen_impl = impl or default_impl
    parsed = libxml2dom.parse(
        stream_or_string,
        html=html,
        htmlencoding=htmlencoding,
        unfinished=unfinished,
        impl=chosen_impl
    )
    initialiseEvents(parsed)
    return parsed
def create_proof(url, parse=None, out=None): """ Create a HTML document with new attributes to provide a proof to benchmarks and some other functions executed by this framework. The basic annotations are: 'proof_productlist' and 'proof_product'. """ if not parse: parse = Path() # use libxml2 to parse the HTML document doc = libxml2dom.parse('%s' %url, html=1, unfinished=1, htmlencoding='latin1') L = [] # check if the parse can find a list of products productList = parse.plist(doc) if len(productList) == 1: node = productList[0] if debug: print 'found a product list!', node.localName elif debug and len(productList) > 1: if debug: print 'found more then one product list!!!', node.localName else: print '\nERROR: Cannot found a list of products using corrent xpath' return products = parse.products(doc) lastline = parse.last(doc) if lastline != None: [products.append(p) for p in lastline] if debug: print len(products) for pl in productList: pl.createAttribute('proof_productlist') pl.setAttribute('proof_productlist', 'true') for product in products: product.createAttribute('proof_product') product.setAttribute('proof_product', 'true') if out != None: print >>out, doc.toString()
def getEd2kLinks(url):
    """Download ``url`` and return the ed2k links found on the page.

    The page is fetched with ``urllib2.urlopen`` (no custom headers are
    sent) and parsed as UTF-8 HTML. The response is closed once parsing has
    finished, whether or not it succeeded.

    Returns:
        A two-element list: the result of ``getEd2kLinkFromDownloadBtn``
        followed by the result of ``getEd2kLinkFromSubtitle``.
    """
    webPage = urllib2.urlopen(url)
    try:
        doc = libxml2dom.parse(webPage, html=1, htmlencoding='utf-8')
    finally:
        webPage.close()
    return [getEd2kLinkFromDownloadBtn(doc), getEd2kLinkFromSubtitle(doc)]
def getEd2kLinks(url):
    """Download ``url`` and return the ed2k links found on the page.

    The page is fetched with ``urllib2.urlopen`` (no custom headers are
    sent) and parsed as UTF-8 HTML. The response is closed once parsing has
    finished, whether or not it succeeded.

    Returns:
        A two-element list: the result of ``getEd2kLinkFromDownloadBtn``
        followed by the result of ``getEd2kLinkFromSubtitle``.
    """
    webPage = urllib2.urlopen(url)
    try:
        doc = libxml2dom.parse(webPage, html=1, htmlencoding='utf-8')
    finally:
        webPage.close()
    return [getEd2kLinkFromDownloadBtn(doc), getEd2kLinkFromSubtitle(doc)]
def handleAsk(request):
    """Run the submitted flat search against bn.ru and return template data.

    Reads 'roomsFr', 'roomsTo', 'priceFr', 'priceTo' and 'stations' from
    ``request.POST``, builds the search URL, downloads the result page and
    extracts the cell texts of the results table.

    Returns:
        dict holding the four submitted bounds, "metros" (the metro part of
        the query string), "url", "trs" (a list of rows, each a list of
        cell texts) and "stations" (``[code, text, selected]`` for every
        entry of ``settings.SUBWAYS``).

    Raises:
        IndexError: if one of the four bound fields has no submitted value.
    """
    post = request.POST
    # Read the selected stations once; they are needed both for the query
    # string and for marking the selected entries below.
    selected = post.getlist('stations')

    formData = {
        "roomsFr": post.getlist('roomsFr')[0],
        "roomsTo": post.getlist('roomsTo')[0],
        "priceFr": post.getlist('priceFr')[0],
        "priceTo": post.getlist('priceTo')[0],
        "metros": "&".join(["metro%5B%5D={0}".format(code) for code in selected]),
    }

    # NOTE(review): the submitted values are interpolated into the query
    # string without URL-escaping.
    url = 'http://www.bn.ru/zap_fl.phtml?kkv1={roomsFr}&kkv2={roomsTo}&price1={priceFr}&price2={priceTo}&so1=&so2=&sk1=&sk2=&type%5B%5D=1&type%5B%5D=3&sorttype=0&sort_ord=0&{metros}&text='.format(**formData)
    formData["url"] = url

    req = urllib2.Request(url, headers={'User-Agent': "Mozilla Firefox"})
    f = urllib2.urlopen(req)
    try:
        doc = libxml2dom.parse(f, html=1)
    finally:
        # Release the HTTP response even when parsing fails.
        f.close()

    # The first three rows of the table are skipped (presumably header
    # rows -- confirm against the page markup).
    trs = doc.xpath('//table[@class="results"]/tr')[3:]
    formData["trs"] = [
        [td.textContent for td in tr.getElementsByTagName('td')]
        for tr in trs
    ]
    formData["stations"] = [
        [code, text, code in selected]
        for code, text in settings.SUBWAYS
    ]
    return formData
def main(argv): #Extract arguments try: opts, args = getopt.getopt(argv, "hu:", ["help", "url="]) except getopt.GetoptError: usage() sys.exit(2) #Check we got some arguments if len( opts ) == 0: usage() sys.exit(2) #Parse command line arguments for opt, arg in opts: if opt in ("-h", "--help"): print_help() sys.exit() elif opt in ("-u", "--url"): url = arg #Open the HTML documment from blogger document = libxml2dom.parse(url, html=1) #Create the XPath expression to look for the entry content xpression = "//div[@class='post-body entry-content']//span" nodes = document.xpath( xpression ) #First print a href link to the blog post print '<a href="' + url + '"> Link to the blog post - Enlace a la pagina en el blog</a>' #For each node in the post content, check whether it is an img or plain textContent for i in nodes: #If img node if ( len( i.getElementsByTagName("img") ) > 0): print (i.getElementsByTagName("img")[0].toString()) print ("<br />") else: print (i.textContent) #Exit sys.exit()
def main(argv): #Extract arguments try: opts, args = getopt.getopt(argv, "hu:", ["help", "url="]) except getopt.GetoptError: usage() sys.exit(2) #Check we got some arguments if len(opts) == 0: usage() sys.exit(2) #Parse command line arguments for opt, arg in opts: if opt in ("-h", "--help"): print_help() sys.exit() elif opt in ("-u", "--url"): url = arg #Open the HTML documment from blogger document = libxml2dom.parse(url, html=1) #Create the XPath expression to look for the entry content xpression = "//div[@class='post-body entry-content']//span" nodes = document.xpath(xpression) #First print a href link to the blog post print '<a href="' + url + '"> Link to the blog post - Enlace a la pagina en el blog</a>' #For each node in the post content, check whether it is an img or plain textContent for i in nodes: #If img node if (len(i.getElementsByTagName("img")) > 0): print(i.getElementsByTagName("img")[0].toString()) print("<br />") else: print(i.textContent) #Exit sys.exit()
use_libxml2macro = "libxml2macro" in sys.argv iterations = [int(arg.split("-")[0]) for arg in sys.argv if arg.endswith("-times")] if len(ot_locations) == 0: print "Please specify the location of the ot.xml file." sys.exit(1) if len(iterations) == 0: iterations = 1 else: iterations = iterations[0] raw_input("Start your engines with ps -p %s -fv" % os.getpid()) t = time.time() for i in range(0, iterations): if use_libxml2macro: n_doc = parseFile(ot_locations[0]) l = test_begat_libxml2macro(n_doc, full_xpath) else: # use_libxml2dom: import libxml2dom doc = libxml2dom.parse(ot_locations[0]) l = test_begat_libxml2dom(doc, full_xpath) print "Time taken", time.time() - t raw_input("Stop your engines!") print l # vim: tabstop=4 expandtab shiftwidth=4
document = libxml2dom.createDocument(None, "doc", None) top = document.xpath("*")[0] elem1 = document.createElementNS("DAV:", "href") print "Namespace is", repr(elem1.namespaceURI) document.replaceChild(elem1, top) elem2 = document.createElementNS(None, "no_ns") print "Namespace is", repr(elem2.namespaceURI) document.xpath("*")[0].appendChild(elem2) print "Find href", len(document.xpath("href")) != 0 print "Find x:href", len(document.xpath("x:href", namespaces={"x": "DAV:"})) != 0 print "Find //no_ns", len(document.xpath("//no_ns")) != 0 print "Find x:href/no_ns", len(document.xpath("x:href/no_ns", namespaces={"x": "DAV:"})) != 0 print document.toString() document.toFile(open("test_ns.xml", "wb")) document = libxml2dom.parse("test_ns.xml") print "Namespace is", repr(document.xpath("*")[0].namespaceURI) print "Namespace is", repr(document.xpath("*/*")[0].namespaceURI) print "Find href", len(document.xpath("href")) != 0 print "Find x:href", len(document.xpath("x:href", namespaces={"x": "DAV:"})) != 0 print "Find //no_ns", len(document.xpath("//no_ns")) != 0 print "Find x:href/no_ns", len(document.xpath("x:href/no_ns", namespaces={"x": "DAV:"})) != 0 print document.toString() print "--------" print print "This is minidom's behaviour for default namespaces:" print document = xml.dom.minidom.Document() elem1 = document.createElementNS("DAV:", "href") print "Namespace is", repr(elem1.namespaceURI)
# 'django.template.loaders.eggs.load_template_source', ) MIDDLEWARE_CLASSES = ( 'django.middleware.common.CommonMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', ) ROOT_URLCONF = 'mysite.urls' TEMPLATE_DIRS = ( "/home/mike/django/mysite" # Put strings here, like "/home/html/django_templates" or "C:/www/django/templates". # Always use forward slashes, even on Windows. # Don't forget to use absolute paths, not relative paths. ) INSTALLED_APPS = ( 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.sites', ) SUBWAYS = [[opt.getAttribute('value'),opt.textContent] for opt in libxml2dom.parse( urllib2.urlopen( urllib2.Request('http://www.bn.ru/zap_fl_w.phtml', headers={'User-Agent' : "Mozilla Firefox"})), html=1) .xpath('//select[@id="metro"]/option')]
#!/usr/bin/env python import libxml2dom schema = libxml2dom.parse("tests/test_valid_relaxng.xml") d = libxml2dom.parse("tests/test_valid.xml") print d.validate(schema) print d.validateDocument(schema) print d.getParameter("error-handler") schema = libxml2dom.parse("tests/test_invalid_relaxng.xml") d = libxml2dom.parse("tests/test_invalid.xml") print d.validate(schema) print d.validateDocument(schema) print d.getParameter("error-handler") # vim: tabstop=4 expandtab shiftwidth=4
sys.exit(1) if sys.argv[2] == "libxml2macro": x2_d = parseFile(sys.argv[1]) t = time.time() x2_d1, x2_d2 = test_import_libxml2macro(x2_d) toFile(x2_d2, "/tmp/xxx_libxml2macro.xml") print "Time", time.time() - t, "seconds" elif sys.argv[2] == "minidom": import xml.dom.minidom d = xml.dom.minidom.parse(sys.argv[1]) t = time.time() d1, d2 = test_import_minidom(d) open("/tmp/xxx_minidom.xml", "wb").write(d2.toxml("utf-8")) print "Time", time.time() - t, "seconds" elif sys.argv[2] == "libxml2dom": import libxml2dom d = libxml2dom.parse(sys.argv[1]) t = time.time() d1, d2 = test_import_libxml2dom(d) libxml2dom.toStream(d2, open("/tmp/xxx_libxml2dom.xml", "wb")) print "Time", time.time() - t, "seconds" # vim: tabstop=4 expandtab shiftwidth=4
#!/usr/bin/env python import libxml2dom schema = libxml2dom.parse("tests/test_valid_schematron.xml") d = libxml2dom.parse("tests/test_valid.xml") print d.validate(schema) print d.validateDocument(schema) print d.getParameter("error-handler") schema = libxml2dom.parse("tests/test_invalid_schematron.xml") d = libxml2dom.parse("tests/test_invalid.xml") print d.validate(schema) print d.validateDocument(schema) print d.getParameter("error-handler") # vim: tabstop=4 expandtab shiftwidth=4