def NewParser(config={}):
    """Create a pyRXPU.Parser with attributes set from *config*.

    config -- optional mapping of parser attribute name -> value, applied
              with setattr onto the freshly created parser.
              (Note: the default dict is never mutated here, so the shared
              mutable default is safe.)

    Returns the configured pyRXPU.Parser instance.

    Thread-safety: parser creation/configuration is serialised under
    the module-level pyRXPLock.
    """
    pyRXPLock.acquire()
    try:
        res = pyRXPU.Parser()
        # Apply each configuration entry as a parser attribute.
        for k in config.keys():
            setattr(res, k, config[k])
    finally:
        # BUG FIX: the lock was previously not released if Parser() or
        # setattr raised, deadlocking every later caller.
        pyRXPLock.release()
    return res
def goodTest(x,t,tb=0,inOnly=0,**kw):
    """Parse *x* with pyRXPU and compare the result (or error text) to *t*.

    x      -- XML source string to parse
    t      -- expected tuple-tree, or expected error string
    tb     -- expected error flag (0 = parse should succeed, 1 = should fail)
    inOnly -- when true, accept *t* being a substring of the result
    kw     -- extra keyword flags handed to pyRXPU.Parser

    Logs via the module-level plog/plogn helpers and prints a '.' (pass)
    or 'E' (fail) progress marker via _dot.
    """
    try:
        P=pyRXPU.Parser(**kw)
        r = P(x)
        rb = 0
    except:
        # Turn the exception into a comparable "<type> <message>" string.
        et, ev, _unused = sys.exc_info()
        sev = ascii(ev)
        # ascii(ev) looks like "error('...')" or "error('...',)"; strip the
        # wrapper. NOTE(review): the slice offsets assume exactly that repr
        # shape -- confirm against the pyRXPU exception type.
        sev = sev[6:-3 if sev.endswith(",)") else -2]
        if sev.startswith('u'): sev = sev[1:]
        r = '%s %s' % (et.__name__, sev[1:])
        rb = 1
    # Render the keyword flags for the log line (leading ", " trimmed below).
    s = ''
    for k,v in kw.items():
        s = s+', %s=%s' % (k,str(v))
    # Escape CR/LF so expected and actual strings log on one line each.
    if type(t) is type(''):
        t = t.replace('\r','\\r')
        t = t.replace('\n','\\n')
    if type(r) is type(''):
        r = r.replace('\r','\\r')
        r = r.replace('\n','\\n')
    plog('%s.Parser(%s)(%s)'%(pyRXPU.__name__,s[2:],repr(x)))
    # NOTE(review): by Python precedence this is
    # (inOnly and t in r) or ((r==t) and rb==tb) -- i.e. the error-flag check
    # is skipped on the substring branch; looks intentional but confirm.
    if (inOnly and t in r) or (r==t) and rb==tb:
        plogn('OK')
        _dot('.')
    else:
        _dot('E')
        plogn('\nBAD got %s' % repr(r))
        plogn('Expected %s' % repr(t))
def xml2doctree(xml):
    """Parse *xml* with a validating pyRXP parser and return its tuple tree.

    Validity errors are fatal, and character/general entities are left
    unexpanded in the output.
    """
    flags = dict(
        ErrorOnValidityErrors=1,
        NoNoDTDWarning=1,
        ExpandCharacterEntities=0,
        ExpandGeneralEntities=0,
    )
    return pyRXP.Parser(**flags).parse(xml)
def parse(self, filename, **kw):
    """Parse XML file *filename* with pyRXPU and return the parse result.

    Forces a fixed set of parser flags (list output, expanded empties,
    processing instructions kept, comments dropped) on top of any caller
    keywords.  Temporarily chdirs into the file's directory so relative
    external entities resolve, and restores the cwd afterwards.

    Debug behaviour is controlled by the module-level ``debug`` bitmask:
    &2 traces begin/end, &4 traces entity-open callbacks, &16 sleeps 1s.
    """
    if debug & 2:
        print('##### About to parse %s' % filename, file=sys.stderr)
    kw = kw.copy()
    kw['ReturnComments'] = 0
    kw['ExpandEmpty'] = 1
    kw['XMLLessThan'] = 1
    kw['ReturnProcessingInstructions'] = 1
    kw['ReturnList'] = 1
    if debug & 4:
        def eocb(s):
            # trace external-entity open callbacks; must return the name
            print(f'+++++ eocb({s!r})', file=sys.stderr, flush=True)
            return s
        kw['eoCB'] = eocb
    parser = pyRXPU.Parser(**kw)
    # Change directory in case we are loading entities from cwd
    retdir = os.getcwd()
    d, n = os.path.split(filename)
    if d:
        # BUG FIX: os.chdir('') raises FileNotFoundError when filename has
        # no directory component; only chdir when there is one.
        os.chdir(d)
    try:
        with open(n, 'rb') as f:
            xml = f.read()
        r = parser.parse(xml)
        if debug:
            print(f'##### r={ascii(r)}', file=sys.stderr, flush=True)
        return r
    finally:
        os.chdir(retdir)
        # BUG FIX: these trace statements previously followed the
        # try/finally's return and were unreachable dead code; run them
        # during unwinding instead so the "Done parsing" trace appears.
        if debug & 2:
            print('Done parsing %s' % filename, file=sys.stderr, flush=True)
            print('=' * 60, file=sys.stderr, flush=True)
        if debug & 16:
            time.sleep(1)
def validateXhtml(source, rootElement="html", wrapIfNeeded=True):
    """Validates against canned DTD and returns the tuple-tree, or an exception.

    It uses a canned copy of the standard in reportPackages.rlextra/dtd.
    You don't have to supply a whole document; if you want to check that the
    content is a valid paragraph, supply 'p' as rootElement.

    By default it's a little bit forgiving and will add the rootElement if
    needed at the start or the finish.  This is useful with tinyMCE text.
    If you turn off wrapIfNeeded, it will assume your tag is there.

    >>> t = validateXhtml('''<html><head><title>hello</title></head><body></body></html>''')
    >>> t = validateXhtml('''<html><head this="unexpected"><title>hello</title></head><body></body></html>''') #doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    error: Error: Undeclared attribute this for element head
     in unnamed entity at line 2 char 17 of [unknown]
    Undeclared attribute this for element head
    Parse Failed!
    <BLANKLINE>
    >>> # now for some intra-paragraph stuff - you can validate any tag
    >>> t = validateXhtml('''<p>Normal<i>text</i> here</p>''', rootElement='p')
    >>> t = validateXhtml('''<p>Normal<i>text</i> here, but no <p>paras</p>!</p>''', rootElement='p') #doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    error: Error: Content model for p does not allow element p here
     in unnamed entity at line 2 char 37 of [unknown]
    Content model for p does not allow element p here
    Parse Failed!
    <BLANKLINE>

    #check it can supply the missing p tag with intra-paragraph content, at start or end
    #as one can get from tinyMCE, with wrapIfNeeded.
    >>> t = validateXhtml('''Normal<i>text</i> here''', rootElement='p')
    >>> t = validateXhtml('''Missing a lead p.</p>''', rootElement='p')
    >>> t = validateXhtml('''<p>Missing a trailing p.''', rootElement='p')
    """
    if wrapIfNeeded:
        #we can cope with missing root element at beginning and end.
        #but if they supplied a doctype or xml declaration, we assume it's
        #complete.
        source = source.strip()
        if source.startswith('<!DOCTYPE') or source.startswith('<?'):
            # already a full document (or at least declared as one): leave it
            pass
        else:
            if not source.startswith("<" + rootElement):
                source = "<%s>%s" % (rootElement, source)
            # NOTE(review): this endswith matches both "</p>" and a bare
            # trailing "p>" -- presumably intentional leniency; confirm.
            if not source.endswith(rootElement + ">"):
                source = "%s</%s>" % (source, rootElement)
            # prepend the canned strict doctype so validation has a DTD
            source = applyDocType(source.strip(), rootElement, 'xhtml1-strict.dtd')
    import pyRXPU
    from reportPackages.rlextra.radxml import xhtml
    # eoCB resolves external entities (the DTD) via the canned local copies
    p = pyRXPU.Parser(eoCB=xhtml.openEntity)
    #try:
    tree = p.parse(source)
    #except: #ought to raise an exception with the offending text
    return tree
def validateXhtmlDocument(docText):
    """Parse, with validation, an XHTML 1.0 Strict document and return pyRXPU tuple tree.

    Raises pyRXPU.error on failure.

    >>> import pyRXPU
    >>> r = validateXhtmlDocument('''<?xml version="1.0" encoding="UTF-8"?>
    ... <!DOCTYPE html SYSTEM "xhtml1-strict.dtd">
    ... <html>
    ... <head><title>Title</title></head>
    ... <body><p></p></body>
    ... </html>
    ... ''')
    >>> r = validateXhtmlDocument('''<?xml version="1.0" encoding="UTF-8"?>
    ... <!DOCTYPE html SYSTEM "xhtml1-strict.dtd">
    ... <boo></boo>
    ... ''')
    Traceback (most recent call last):
    error: Error: Start tag for undeclared element boo
     in unnamed entity at line 3 char 5 of [unknown]
    Start tag for undeclared element boo
    Parse Failed!
    <BLANKLINE>

    XHTML Transitional attributes like bgcolor are NOT OK:

    >>> r = validateXhtmlDocument('''<?xml version="1.0" encoding="UTF-8"?>
    ... <!DOCTYPE html SYSTEM "xhtml1-strict.dtd">
    ... <html>
    ... <head><title>Title</title></head>
    ... <body><p><img border="1"></p></body>
    ... </html>
    ... ''')
    Traceback (most recent call last):
    error: Error: Undeclared attribute border for element img
     in unnamed entity at line 5 char 21 of [unknown]
    Undeclared attribute border for element img
    Parse Failed!
    <BLANKLINE>

    It's an error to supply a document that's declared to be XHTML transitional:

    >>> try:
    ...     r = validateXhtmlDocument('''<?xml version="1.0" encoding="UTF-8"?>
    ... <!DOCTYPE html SYSTEM "xhtml1-transitional.dtd">
    ... <html>
    ... <head><title>Title</title></head>
    ... <body><p></p></body>
    ... </html>
    ... ''')
    ... except pyRXPU.error:
    ...     pass
    ... else:
    ...     print("expected pyRXPU.error")
    """
    import pyRXPU
    # CONSISTENCY FIX: the sibling validators import xhtml as
    # "from reportPackages.rlextra.radxml import xhtml"; the previous bare
    # "import xhtml" is an implicit relative import that fails on Python 3.
    from reportPackages.rlextra.radxml import xhtml
    # eoCB resolves the strict DTD from the canned local copy
    p = pyRXPU.Parser(eoCB=xhtml.openEntity)
    return p.parse(docText)
def validateXhtmlTransitionalDocument(docText):
    """Parse, with validation, an XHTML 1.0 Transitional document and return pyRXPU tuple tree.

    Raises pyRXPU.error on failure.

    >>> import pyRXPU
    >>> r = validateXhtmlTransitionalDocument(b'''<?xml version="1.0" encoding="UTF-8"?>
    ... <!DOCTYPE html SYSTEM "xhtml1-transitional.dtd">
    ... <html>
    ... <head><title>Title</title></head>
    ... <body><p></p></body>
    ... </html>
    ... ''')
    >>> r = validateXhtmlTransitionalDocument(b'''<?xml version="1.0" encoding="UTF-8"?>
    ... <!DOCTYPE html SYSTEM "xhtml1-transitional.dtd">
    ... <boo></boo>
    ... ''')
    ... #doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    error: Error: Start tag for undeclared element boo
     in unnamed entity at line 3 char 5 of [unknown]
    Start tag for undeclared element boo
    Parse Failed!
    <BLANKLINE>

    XHTML Transitional attributes like border are OK:

    >>> r = validateXhtmlTransitionalDocument(b'''<?xml version="1.0" encoding="UTF-8"?>
    ... <!DOCTYPE html SYSTEM "xhtml1-transitional.dtd">
    ... <html>
    ... <head><title>Title</title></head>
    ... <body><p><img src="blah" alt="blah" border="1"></img></p></body>
    ... </html>
    ... ''')

    It's an error to supply a document that's declared to be XHTML strict:

    >>> try:
    ...     r = validateXhtmlTransitionalDocument('''<?xml version="1.0" encoding="UTF-8"?>
    ... <!DOCTYPE html SYSTEM "xhtml1-strict.dtd">
    ... <html>
    ... <head><title>Title</title></head>
    ... <body><p></p></body>
    ... </html>
    ... ''')
    ... except pyRXPU.error:
    ...     pass
    ... else:
    ...     print("expected pyRXPU.error")
    """
    import pyRXPU
    from reportPackages.rlextra.radxml import xhtml
    # The transitional entity-opener supplies the transitional DTD copy.
    parser = pyRXPU.Parser(eoCB=xhtml.openEntityTransitional)
    tree = parser.parse(docText)
    return tree
def test1(n):
    """Build a fresh Parser and parse a tiny document *n* times.

    Prints a '.' per iteration, then deletes every local reference so a
    leak/refcount check afterwards sees a clean slate.
    """
    print('starting test1')
    import pyRXPU
    import sys
    xml = '<start><tag1>text here</tag1><tag1>more text</tag1></start>'
    parser = i = tree = None
    for i in range(n):
        parser = pyRXPU.Parser()
        tree = parser(xml)
        sys.stdout.write('.')
    # drop all local references explicitly
    del n, pyRXPU, sys, parser, tree, xml
    print('\ntest1 done')
def findImages(xml):
    "Returns lists of all images referred to in markup"
    # Wrap the fragment into a full XHTML document, parse it, and walk
    # the resulting tuple tree collecting image references.
    doc = xhtmlDocFromXhtmlFragment(asUnicode(xml))  #adds 'html','head etc. to our fragment
    tree = pyRXPU.Parser(
        ReturnUTF8=1,
        ExpandCharacterEntities=0,
        ExpandGeneralEntities=0).parse(doc)
    finder = ImageFinder(tree)
    finder.go()
    return finder.images
def xml2rad(xml, validating=1, eoCB=None):
    '''convert xml to radxml form'''
    if not validating:
        # lightweight non-validating fallback parser
        from reportlab.lib import rparsexml
        return rparsexml.parsexml0(xml)[0][2][0]
    from reportlab.lib.utils import isCompactDistro, open_and_read
    if not eoCB and isCompactDistro():
        # compact distros resolve external entities through open_and_read
        eoCB = lambda x, open_and_read=open_and_read: (x, open_and_read(x))
    import pyRXPU
    return pyRXPU.Parser().parse(xml, eoCB=eoCB)
def test(fn):
    """Time reading and pyRXP-parsing of XML file *fn*; return the tuple tree.

    Temporarily chdirs into the file's directory (so relative external
    entities resolve) and always restores the original cwd.
    """
    cwd = os.getcwd()
    try:
        print('BEGIN')
        t0 = time.time()
        # BUG FIX: close the file instead of leaking the handle
        with open(fn, 'r') as f:
            xml = f.read()
        d = os.path.dirname(fn)
        if d:
            # BUG FIX: os.chdir('') raises when fn has no directory part
            os.chdir(d)
        t1 = time.time()
        D = pyRXP.Parser().parse(xml)
        t2 = time.time()
        print('END read took %7.2f" parse took %7.2f"' % (t1-t0, t2-t1))
        return D
    finally:
        os.chdir(cwd)
def parse_catalog(filename):
    """Validate and parse XML.  This will complain if invalid

    We fully parse the XML and turn into Python variables, so that any
    encoding issues are confronted here rather than in the template.
    Returns a list of non-archived Product objects sorted by model number.
    """
    # BUG FIX: close the catalog file instead of leaking the handle
    with open(filename) as f:
        xml = f.read()
    if isUnicode(xml):
        xml = xml.encode('utf8') #required for python 2.7 & >=3.3
    p = pyRXPU.Parser()
    tree = p.parse(xml)
    tagTree = TagWrapper(tree)
    request_a_quote = [109, 110, 4121, 4122, 4123]
    # we now need to de-duplicate; the query returns multiple rows with different images
    # in them. if id is same, assume it's the same product.
    ids_seen = set()
    products = []
    for prodTag in tagTree:
        prod_id = int(str(prodTag.ProductId1))  #extract tag content ('id' shadowed a builtin)
        if prod_id in ids_seen:
            continue
        ids_seen.add(prod_id)
        prod = Product()
        prod.id = prod_id
        prod.modelNumber = int(str(prodTag.ModelNumber))
        prod.archived = (str(prodTag.Archived) == 'true')
        prod.name = fix(prodTag.ModelName)
        prod.summary = fix(prodTag.Summary)
        prod.description = fix(prodTag.Description)
        #originally the images came from a remote site. We have stashed them in
        #the img/ subdirectory, so just chop off the final part of the path.
        #asNative required for python 2.7 & >=3.3
        prod.image = os.path.split(asNative(fix(
            prodTag.ImageUrl)))[-1].replace(' ', '')
        if prod.modelNumber in request_a_quote:
            prod.price = "Call us on 01635 246830 for a quote"
        else:
            # drop the last two characters of the raw cost string
            # (equivalent to the original s[0:len(s)-2] slice)
            prod.price = '£' + str(prodTag.UnitCost)[:-2]
        if not prod.archived:
            products.append(prod)
    products.sort(key=lambda x: x.modelNumber)
    return products
def parse_catalog(filename):
    """Validate and parse XML.  This will complain if invalid

    We fully parse the XML and turn into Python variables, so that any
    encoding issues are confronted here rather than in the template.
    Returns a list of non-archived Product objects sorted by model number.
    """
    # BUG FIX: close the catalog file instead of leaking the handle
    with open(filename) as f:
        xml = f.read()
    p = pyRXPU.Parser()
    tree = p.parse(xml)
    tagTree = TagWrapper(tree)
    request_a_quote = [109, 110, 4121, 4122, 4123]
    # we now need to de-duplicate; the query returns multiple rows with different images
    # in them. if id is same, assume it's the same product.
    ids_seen = set()
    products = []
    for prodTag in tagTree:
        prod_id = int(str(prodTag.ProductId1))  #extract tag content ('id' shadowed a builtin)
        if prod_id in ids_seen:
            continue
        ids_seen.add(prod_id)
        prod = Product()
        prod.id = prod_id
        prod.modelNumber = int(str(prodTag.ModelNumber))
        prod.archived = (str(prodTag.Archived) == 'true')
        prod.name = fix(prodTag.ModelName)
        prod.summary = fix(prodTag.Summary)
        prod.description = fix(prodTag.Description)
        if prod.modelNumber in request_a_quote:
            prod.price = "Call us on 01635 246830 for a quote"
        else:
            # drop the last two characters of the raw cost string
            # (equivalent to the original s[0:len(s)-2] slice)
            prod.price = '£' + str(prodTag.UnitCost)[:-2]
        if not prod.archived:
            products.append(prod)
    products.sort(key=lambda x: x.modelNumber)
    return products
# Benchmark: time the expat-based treebuilder, then pyRXP, and compare trees.
# BUG FIX: time.clock() was deprecated in 3.3 and removed in Python 3.8;
# time.perf_counter() is the documented replacement for interval timing.
start = time.perf_counter()
tree = p.parseText(xmlText)
finish = time.perf_counter()
print('parsed %d bytes in %0.4f seconds' % (len(xmlText), finish-start))
if small:
    print('expat tree:')
    pp(tree)
print('Try 2: with pyRXP based treebuilder')
try:
    import pyRXPU
except ImportError:
    # pyRXP not available: nothing to compare against
    sys.exit()
print()
#try comparison
p = pyRXPU.Parser()
start = time.perf_counter()
tree2 = p.parse(xmlText)
finish = time.perf_counter()
print('parsed %d bytes in %0.4f seconds' % (len(xmlText), finish-start))
if small:
    print('pyRXP tree:')
    pp(tree2)
if tree == tree2:
    print('Exact match!')
else:
    print('trees differ!')
    if not small:
        # BUG FIX: close the dump files instead of leaking open handles
        with open('/tmp/expat_tree.txt', 'w') as f:
            pp(tree, stream=f)
        with open('/tmp/pyrxp_tree.txt', 'w') as f:
            pp(tree2, stream=f)
def xhtml2rml(xml, paraStyle='normal', tableStyle='noPaddingStyle', bulletStyle='bullet', pathTransform=None, imageTransformKwds={}, allowMailtoLinks=False):
    """Convert chunk of our mini-html to RML.

    >>> xhtml2rml('text')=='text' #avoid spurious whitespace
    True
    >>> xhtml2rml('2m * 2m = 4m<sup>2</sup>')=='2m * 2m = 4m<sup>2</sup>' #why it matters - space appears!
    True
    >>> xhtml2rml('<p>test</p>')=='<para style="normal">test</para>'
    True
    >>> xhtml2rml('<p>test</p>', paraStyle='custom')=='<para style="custom">test</para>'
    True
    >>> from rlextra.radxml.html_cleaner import cleanBlocks
    >>> xhtml2rml(cleanBlocks('<p>aaaa <img src="rml:img1">bbbb<p>cccc<img src="rml:img2"> dddd</p> eeeee</p>'))=='<imageAndFlowables imageName="rml:img1" imageSide="left"><para style="normal">aaaa bbbb</para></imageAndFlowables><imageAndFlowables imageName="rml:img2" imageSide="left"><para style="normal">cccc dddd</para></imageAndFlowables><para style="normal"> eeeee</para>'
    True
    """
    xml = xhtmlDocFromXhtmlFragment(xml)
    # Build the tag -> RML template mapping.
    M = MapController()
    M[""] = '%(__content__)s'
    M["html"] = '%(__content__)s'
    M["body"] = '%(__content__)s'
    M["head"] = '%(__content__)s'
    M["title"] = '%(__content__)s'
    M["p"] = '%(__content__)s'
    M["p"].transformContent(curry(pTransform, paraStyle=paraStyle))
    M["img"] = MapNode(None, '%(__attrs__)s')
    if pathTransform:
        # BUG FIX: copy before mutating -- the default {} is shared across
        # calls, so writing pathTransform into it leaked state into every
        # later invocation that relied on the default.
        imageTransformKwds = dict(imageTransformKwds)
        imageTransformKwds['pathTransform'] = pathTransform
    M["img"].addTransform('__attrs__', ImageTransform(**imageTransformKwds))
    M["table"] = '<blockTable style="' + tableStyle + '">%(__content__)s</blockTable>'
    M["tr"] = '<tr>%(__content__)s</tr>'
    M["td"] = '<td><para style="' + paraStyle + '">%(__content__)s</para></td>'
    M["th"] = '<td><para style="' + paraStyle + '">%(__content__)s</para></td>'
    M["b"] = '<b>%(__content__)s</b>'
    M["i"] = '<i>%(__content__)s</i>'
    M["u"] = '<u>%(__content__)s</u>'
    M["sup"] = '<sup>%(__content__)s</sup>'
    M["sub"] = '<sub>%(__content__)s</sub>'
    M["strong"] = MapNode('<b>%(__content__)s</b>', '')
    M["em"] = '<i>%(__content__)s</i>'
    M["br"] = MapNode(None, '<br/>')
    M["h1"] = '<para style="h1">%(__content__)s</para>'
    M["h2"] = '<para style="h2">%(__content__)s</para>'
    M["h3"] = '<para style="h3">%(__content__)s</para>'
    M["h4"] = '<para style="h4">%(__content__)s</para>'
    M["h5"] = '<para style="h5">%(__content__)s</para>'
    M["h6"] = '<para style="h6">%(__content__)s</para>'
    M["ul"] = '%(__content__)s'
    M["ol"] = '%(__content__)s'
    M["li"] = '<para style="' + bulletStyle + '"><bullet>•</bullet>%(__content__)s</para>'
    M["address"] = '<para style="' + paraStyle + '">%(__content__)s</para>'
    M["span"] = '%(__content__)s'
    M["a"] = MapNode('%(__attrs__)s', None)
    if allowMailtoLinks:
        M["a"].addTransform('__attrs__', aTransformMailto)
    else:
        M["a"].addTransform('__attrs__', aTransform)
    # Parse without entity expansion so entities survive into the RML.
    parser = pyRXPU.Parser(ExpandCharacterEntities=0, ExpandGeneralEntities=0)
    xml = parser.parse(xml)
    return M.process(xml, isTupleTree=True)
def xhtml2rml(
        xml,
        paraStyle='normal',
        tableStyle='noPaddingStyle',
        bulletStyle='bullet',
        pathTransform=None,
        imageTransformKwds={},
        allowMailtoLinks=False,
        useModernLists=True,
        ulStyle=None,
        olStyle=None,
        liParaStyle=None,
        tagAttrs={},
        ):
    """Convert chunk of our mini-html to RML.

    >>> xhtml2rml('text')=='text' #avoid spurious whitespace
    True
    >>> xhtml2rml('2m * 2m = 4m<sup>2</sup>')=='2m * 2m = 4m<sup>2</sup>' #why it matters - space appears!
    True
    >>> xhtml2rml('<p>test</p>')=='<para style="normal">test</para>'
    True
    >>> xhtml2rml('<h1>test</h1>',tagAttrs=dict(h1=dict(style='myh1style')))=='<para style="myh1style">test</para>'
    True
    >>> xhtml2rml('<h6>test</h6>',tagAttrs=dict(h6=dict(style='myh6style')))=='<para style="myh6style">test</para>'
    True
    >>> xhtml2rml('<p>test</p>', paraStyle='custom')=='<para style="custom">test</para>'
    True
    >>> from rlextra.radxml.html_cleaner import cleanBlocks
    >>> xhtml2rml(cleanBlocks('<p>aaaa <img src="rml:img1">bbbb<p>cccc<img src="rml:img2"> dddd</p> eeeee</p>'))=='<imageAndFlowables imageName="rml:img1" imageSide="left"><para style="normal">aaaa bbbb</para></imageAndFlowables><imageAndFlowables imageName="rml:img2" imageSide="left"><para style="normal">cccc dddd</para></imageAndFlowables><para style="normal"> eeeee</para>'
    True
    >>> xhtml2rml(cleanBlocks('<ol><li>one</li><li>two</li></ol>'), olStyle='xxx')=='<ol style="xxx"><li><para style="normal">one</para></li><li><para style="normal">two</para></li></ol>'
    True
    >>> xhtml2rml(cleanBlocks('<ul><li>one</li><li>two</li></ul>'), ulStyle='xxx')=='<ul style="xxx"><li><para style="normal">one</para></li><li><para style="normal">two</para></li></ul>'
    True
    >>> xhtml2rml(cleanBlocks('<ul><li>one</li><li>two</li></ul>'), ulStyle='xxx', tagAttrs=dict(li=dict(bulletOffsetY=2)))=='<ul style="xxx"><li bulletOffsetY="2"><para style="normal">one</para></li><li bulletOffsetY="2"><para style="normal">two</para></li></ul>'
    True
    """
    xml = xhtmlDocFromXhtmlFragment(xml)
    M = MapController()
    tagAttrs = tagAttrs.copy()
    M[""] = '%(__content__)s'
    M["html"] = '%(__content__)s'
    M["body"] = '%(__content__)s'
    M["head"] = '%(__content__)s'
    M["title"] = '%(__content__)s'
    M["p"] = '%(__content__)s'
    M["p"].transformContent(curry(pTransform, paraStyle=paraStyle))
    M["img"] = MapNode(None, '%(__attrs__)s')
    if pathTransform:
        # BUG FIX: copy before mutating -- the default {} is shared across
        # calls, so writing pathTransform into it leaked state into every
        # later invocation that relied on the default.
        imageTransformKwds = dict(imageTransformKwds)
        imageTransformKwds['pathTransform'] = pathTransform
    M["img"].addTransform('__attrs__', ImageTransform(**imageTransformKwds))
    M["table"] = '<blockTable%s>%%(__content__)s</blockTable>' % extractTagAttrs(
        'table', tagAttrs, style=tableStyle)
    # BUG FIX: this entry had a doubled %% but no %-formatting applied, so the
    # template carried a literal '%%' unlike every sibling entry (and the v1
    # implementation); use the single-% form the substitution step expects.
    M["tr"] = '<tr>%(__content__)s</tr>'
    paraAttrs = extractTagAttrs('para', tagAttrs, style=paraStyle)
    attrs = (extractTagAttrs('td', tagAttrs), paraAttrs)
    M["td"] = '<td%s><para%s>%%(__content__)s</para></td>' % attrs
    M["th"] = '<td%s><para%s>%%(__content__)s</para></td>' % attrs
    M["b"] = '<b>%(__content__)s</b>'
    M["i"] = '<i>%(__content__)s</i>'
    M["u"] = '<u>%(__content__)s</u>'
    M["sup"] = '<sup>%(__content__)s</sup>'
    M["sub"] = '<sub>%(__content__)s</sub>'
    M["strong"] = MapNode('<b>%(__content__)s</b>', '')
    M["em"] = '<i>%(__content__)s</i>'
    M["br"] = MapNode(None, '<br/>')
    for n in '123456':
        tag = 'h' + n
        weakUpdateTagAttrs(tag, tagAttrs, style=tag)
        M[tag] = '<para%s>%%(__content__)s</para>' % extractTagAttrs(
            tag, tagAttrs)
    if useModernLists:
        liParaStyle = liParaStyle or paraStyle
        M["ul"] = '<ul%s>%%(__content__)s</ul>' % extractTagAttrs(
            'ul', tagAttrs, style=ulStyle)
        M["ol"] = '<ol%s>%%(__content__)s</ol>' % extractTagAttrs(
            'ol', tagAttrs, style=olStyle)
        M["li"] = '<li%s><para%s>%%(__content__)s</para></li>' % (
            extractTagAttrs('li', tagAttrs),
            extractTagAttrs('para', tagAttrs, style=liParaStyle))
    else:
        #oldstyle for now
        M["ul"] = '%(__content__)s'
        M["ol"] = '%(__content__)s'
        M["li"] = '<para%s><bullet>•</bullet>%%(__content__)s</para>' % extractTagAttrs(
            'bulletPara', tagAttrs, style=bulletStyle)
    M["address"] = '<para%s>%%(__content__)s</para>' % paraAttrs
    M["span"] = '%(__content__)s'
    M["a"] = MapNode('%(__attrs__)s', None)
    if allowMailtoLinks:
        M["a"].addTransform('__attrs__', aTransformMailto)
    else:
        M["a"].addTransform('__attrs__', aTransform)
    # Parse without entity expansion so entities survive into the RML.
    parser = pyRXPU.Parser(ExpandCharacterEntities=0, ExpandGeneralEntities=0)
    xml = parser.parse(xml)
    return M.process(xml, isTupleTree=True)
def _runTests():
    """Exercise pyRXPU: parser flags, repr/type sanity, then a battery of
    goodTest/failTest vectors checking tuple-tree output and error text."""
    plogn('############# Testing %s=%8.8X'%(pyRXPU.__name__,id(pyRXPU)))
    # Every documented parser flag must be accepted at Parser() construction.
    try:
        for k,v in pyRXPU.parser_flags.items():
            eval('pyRXPU.Parser(%s=%d)' % (k,v))
        plogn('Parser keywords OK')
        _dot('.')
    except:
        traceback.print_exc()
        plogn('Parser keywords BAD')
        _dot('E')
    # ...and also at parse() call time.
    try:
        for k,v in pyRXPU.parser_flags.items():
            eval('pyRXPU.Parser()("<a/>",%s=%d)' % (k,v))
        plogn('Parser().parse keywords OK')
        _dot('.')
    except:
        traceback.print_exc()
        plogn('Parser().parse keywords BAD')
        _dot('E')
    # Basic repr/__class__/type sanity on a parser instance.
    try:
        P=pyRXPU.Parser()
        plog('Parser()=%r' % P)
        plog('Parser().__class__=%r' % P.__class__)
        plog('type(Parser())=%r\n\n' % type(P))
        del P
        plogn('Parser().__class__ etc OK')
        _dot('.')
    except:
        traceback.print_exc()
        plogn('Parser().__class__ etc BAD')
        _dot('E')
    # Tuple-tree shape under the ExpandEmpty/MakeMutableTree flag combinations.
    goodTest('<a></a>',('a', None, [], None))
    goodTest('<a></a>',('a', {}, [], None),ExpandEmpty=1)
    goodTest('<a></a>',['a', None, [], None],MakeMutableTree=1)
    goodTest('<a/>',('a', None, None, None))
    goodTest('<a/>',('a', {}, [], None),ExpandEmpty=1)
    goodTest('<a/>',['a', None, None, None],MakeMutableTree=1)
    goodTest('<a/>',['a', {}, [], None],ExpandEmpty=1,MakeMutableTree=1)
    failTest('</a>',"error Error: End tag </a> outside of any element\n in unnamed entity at line 1 char 4 of [unknown]\nEnd tag </a> outside of any element\nParse Failed!\n")
    # Comment handling and PCData merging.
    goodTest('<a>A<!--comment--></a>',('a', None, ['A'], None))
    goodTest('<a>A<!--comment--></a>',('a', {}, ['A'], None),ExpandEmpty=1)
    goodTest('<a>A<!--comment--></a>', ('a', None, ['A', ('<!--', None, ['comment'], None)], None), ReturnComments=1)
    goodTest('<a>A<&></a>',('a', None, ['A<&>'], None))
    goodTest('<a>A<&></a>',('a', None, ['A', '<', '&', '>'], None), MergePCData=0)
    goodTest('<!--comment--><a/>',('a', None, None, None),ReturnComments=1)
    goodTest('<!--comment--><a/>',[('<!--',None,['comment'],None),('a', None, None, None)],ReturnComments=1,ReturnList=1)
    goodTest('<!--comment--><a/>',('a', None, None, None),ReturnComments=1)
    failTest('<?xml version="1.0" encoding="LATIN-1"?></a>',"error Unknown declared encoding LATIN-1\nInternal error, ParserPush failed!\n")
    # Processing instructions and source-location recording.
    goodTest('<?work version="1.0" encoding="utf-8"?><a/>',[('<?',{'name':'work'}, ['version="1.0" encoding="utf-8"'],None), ('a', None, None, None)],IgnorePlacementErrors=1,ReturnList=1,ReturnProcessingInstructions=1,ReturnComments=1)
    goodTest('<a>\nHello\n<b>cruel\n</b>\nWorld\n</a>',('a', None, ['\nHello\n', ('b', None, ['cruel\n'], (('aaa', 2, 3), ('aaa', 3, 4))), '\nWorld\n'], (('aaa', 0, 3), ('aaa', 5, 4))),fourth=pyRXPU.recordLocation,srcName='aaa')
    goodTest('<a aname="ANAME" aother="AOTHER">\nHello\n<b bname="BNAME" bother="BOTHER">cruel\n</b>\nWorld\n</a>',('a', {"aname": "ANAME", "aother": "AOTHER"}, ['\nHello\n', ('b', {"bname": "BNAME", "bother": "BOTHER"}, ['cruel\n'], (('aaa', 2, 33), ('aaa', 3, 4))), '\nWorld\n'], (('aaa', 0, 33), ('aaa', 5, 4))),fourth=pyRXPU.recordLocation,srcName='aaa')
    # CDATA sections, plain and as tuples.
    goodTest('<a><![CDATA[<a>]]></a>',('a', None, ['<a>'], None))
    goodTest('<a><![CDATA[<a>]]></a>',('a', None, [('<![CDATA[', None, ['<a>'], None)], None),ReturnCDATASectionsAsTuples=1)
    # Namespace expansion into {uri}tag form with attribute retention.
    goodTest('''<foo:A xmlns:foo="http://www.foo.org/"><foo:B><foo:C xmlns:foo="http://www.bar.org/"><foo:D>abcd</foo:D></foo:C></foo:B><foo:B/><A>bare A<C>bare C</C><B>bare B</B></A><A xmlns="http://default.reportlab.com/" xmlns:bongo="http://bongo.reportlab.com/">default ns A<bongo:A>bongo A</bongo:A><B>default NS B</B></A></foo:A>''',('{http://www.foo.org/}A', {'xmlns:foo': 'http://www.foo.org/'}, [('{http://www.foo.org/}B', None, [('{http://www.bar.org/}C', {'xmlns:foo': 'http://www.bar.org/'}, [('{http://www.bar.org/}D', None, ['abcd'], None)], None)], None), ('{http://www.foo.org/}B', None, None, None), ('A', None, ['bare A', ('C', None, ['bare C'], None), ('B', None, ['bare B'], None)], None), ('{http://default.reportlab.com/}A', {'xmlns': 'http://default.reportlab.com/', 'xmlns:bongo': 'http://bongo.reportlab.com/'}, ['default ns A', ('{http://bongo.reportlab.com/}A', None, ['bongo A'], None), ('{http://default.reportlab.com/}B', None, ['default NS B'], None)], None)], None),XMLNamespaces=1,ReturnNamespaceAttributes=1)
    # Recursion-depth limit and entity handling.
    failTest(bigDepth(257),"""error Internal error, stack limit reached!\n""", inOnly=1)
    failTest('<a>Áá</a>','error Error: Undefined entity Aacute\n in unnamed entity at line 1 char 12 of [unknown]\nUndefined entity Aacute\nParse Failed!\n')
    goodTest('<a>Á</a>',('a', None, ['\xc1'], None), ugeCB=ugeCB)
    # DTD resolution without an entity-open callback.
    filename = os.path.join(os.getcwd(),'not-there.dtd').replace(os.sep,'/')
    if filename.startswith('/'): filename = filename[1:]
    failTest('<!DOCTYPE foo SYSTEM "not-there.dtd"><foo>foo<a>aaa</a>fum</foo>',"error Error: Couldn't open dtd entity file:///%(filename)s\\n in unnamed entity at line 1 char 38 of [unknown]"%vars(),NoNoDTDWarning=0,inOnly=1)
    failTest('<!DOCTYPE foo SYSTEM "is-there.dtd"><foo><a>aaa</a></foo>','error Error: Content model for foo does not allow it to end here\\n in unnamed entity at line 1 char 57 of [unknown]',inOnly=1,NoNoDTDWarning=0)
    goodTest('<!DOCTYPE foo SYSTEM "is-there.dtd"><foo><a>aaa</a><b>bbbb</b></foo>',('foo', None, [('a', None, ['aaa'], None), ('b', None, ['bbbb'], None)], None),NoNoDTDWarning=0)
    # Same cases with the eoDTD entity-open callback in the loop.
    failTest('<!DOCTYPE foo SYSTEM "is-there.dtd"><foo><a>aaa</a></foo>','error Error: Content model for foo does not allow it to end here',inOnly=1,NoNoDTDWarning=0,eoCB=eoDTD)
    goodTest('<!DOCTYPE foo SYSTEM "is-there.dtd"><foo><a>aaa</a><b>bbbb</b></foo>',('foo', None, [('a', None, ['aaa'], None), ('b', None, ['bbbb'], None)], None),NoNoDTDWarning=0,eoCB=eoDTD)
    goodTest('<!DOCTYPE foo SYSTEM "have-utf8-content.dtd"><foo><a>aaa</a><b>bbbb</b></foo>',('foo', None, [('a', None, ['aaa'], None), ('b', None, ['bbbb'], None)], None),NoNoDTDWarning=0,eoCB=eoDTD)
    goodTest('<!DOCTYPE foo SYSTEM "have-unicode-content.dtd"><foo><a>aaa</a><b>bbbb</b></foo>',('foo', None, [('a', None, ['aaa'], None), ('b', None, ['bbbb'], None)], None),NoNoDTDWarning=0,eoCB=eoDTD)
    filename = os.path.join(os.getcwd(),'really-not-there.dtd').replace(os.sep,'/')
    if filename.startswith('/'): filename = filename[1:]
    failTest('<!DOCTYPE foo SYSTEM "not-there.dtd"><foo>foo<a>aaa</a>fum</foo>',"error Error: Couldn't open dtd entity file:///%(filename)s\\n in unnamed entity at line 1 char 38 of [unknown]\\n"%vars(),NoNoDTDWarning=0,eoCB=eoDTD,inOnly=1)
    failTest('<!DOCTYPE foo SYSTEM "badt-have-utf8-content.dtd"><foo><a>aaa</a><b>bbbb</b></foo>',"error Error: Couldn't open dtd entity badt-have-utf8-content.dtd\\n in unnamed entity at line 1 char 51 of [unknown]",inOnly=1,NoNoDTDWarning=0,eoCB=eoDTD)
    failTest('<!DOCTYPE foo SYSTEM "badt-have-unicode-content.dtd"><foo><a>aaa</a><b>bbbb</b></foo>',"error Error: Couldn't open dtd entity badt-have-unicode-content.dtd\\n in unnamed entity at line 1 char 54 of [unknown]",inOnly=1,NoNoDTDWarning=0,eoCB=eoDTD)
def getNonValidatingPyRXPParser():
    """Return a pyRXPU parser with DTD validation switched off."""
    import pyRXPU
    return pyRXPU.Parser(Validate=0)
def run2():
    """Parse the module-level *src* document *n* times with one shared
    parser (entity callback *ident*), printing a dot per parse."""
    import pyRXPU
    import sys
    parser = pyRXPU.Parser(eoCB=ident)
    for _loop in range(n):
        tree = parser.parse(src)
        sys.stdout.write('.')
def InsertMultipleDQXMLFileThreaded(filenames, logger, server='http://slwebtest.virgo.infn.it', hackDec11=False, debug=True, threads=1, testing_options={}):
    """
    Inserts multiple dqxml files of data into the DQSEGDB.

    Input:
    - filenames is a list of string filenames for DQXML files.
    - hackDec11 is deprecated (always should be false): this was used to
      differentiate function against different server APIs before we used
      numbering and responses to make decisions.
    - testing_options is a dictionary including (optionally): offset (int),
      synchronize (time in 'HH:MM' format (string))
    Output:
    returns True if it completes successfully

    NOTE(review): Python 2 only (`from Queue import Queue`, print statements).
    NOTE(review): testing_options={} is a shared mutable default argument;
    the body only reads from it, but confirm no caller relies on mutation.
    """
    logger.info(
        "Beginning call to InsertMultipleDQXMLFileThreaded. This message last updated April 14 2015, Ciao da Italia!"
    )
    from threading import Thread
    from Queue import Queue
    import sys
    # Make a call to server+'/dq':
    protocol = server.split(':')[0]
    serverfqdn = server.split('/')[-1]
    apiResult = queryAPIVersion(protocol, serverfqdn, False)
    # If the API change results in a backwards incompatibility, handle it here with a flag that affects behavior below
    if apiResult >= "2.1.0":
        # S6 style comments are needed
        new_comments = True
    else:
        # Older server, so don't want to supply extra comments...
        new_comments = False
    if apiResult >= "2.1.15":
        # Alteration to insertion_metadata from uri to comment to accomodate s6 data conversion
        use_new_insertion_metadata = True
    else:
        use_new_insertion_metadata = False
    if 'offset' in testing_options:
        offset = int(testing_options['offset'])
    else:
        offset = 0
    if 'synchronize' in testing_options:
        # NOTE(review): `synchronize` is bound here but never used in this
        # function body -- presumably consumed elsewhere or dead; confirm.
        synchronize = testing_options['synchronize']
    xmlparser = pyRXP.Parser()
    lwtparser = ldbd.LIGOLwParser()
    flag_versions = {}
    # flag_versions, filename, server, hackDec11, debug are current variables
    # This next bunch of code is specific to a given file:
    if len(filenames) < 1:
        print "Empty file list sent to InsertMultipleDQXMLFileThreaded"
        raise ValueError
    for filename in filenames:
        segment_md = setupSegment_md(filename, xmlparser, lwtparser, debug)
        # segment_md, flag_versions, filename, server, hackDec11, debug are current variables
        # Build a numbered dict of segment_definer rows: column name -> value.
        flag_versions_numbered = {}
        for j in range(len(segment_md.table['segment_definer']['stream'])):
            flag_versions_numbered[j] = {}
            for i, entry in enumerate(
                    segment_md.table['segment_definer']['orderedcol']):
                #print j,entry,segment_md.table['segment_definer']['stream'][j][i]
                flag_versions_numbered[j][entry] = segment_md.table[
                    'segment_definer']['stream'][j][i]
        # parse process table and make a dict that corresponds with each
        # process, where the keys for the dict are like "process:process_id:1"
        # so that we can match
        # these to the flag_versions from the segment definer in the next
        # section
        # Note: Wherever temp_ preceeds a name, it is generally an identifier
        # field from the dqxml, that is only good for the single dqxml file
        # being parsed
        process_dict = {}
        # Going to assign process table streams to process_dict with a key
        # matching process_id (process:process_id:0 for example)
        for j in range(len(segment_md.table['process']['stream'])):
            process_id_index = segment_md.table['process']['orderedcol'].index(
                'process_id')
            temp_process_id = segment_md.table['process']['stream'][j][
                process_id_index]
            # Now we're going to assign elements to process_dict[process_id]
            process_dict[temp_process_id] = {}
            for i, entry in enumerate(
                    segment_md.table['process']['orderedcol']):
                #print j,entry,segment_md.table['process']['stream'][j][i]
                process_dict[temp_process_id][entry] = segment_md.table[
                    'process']['stream'][j][i]
            # Note that the segment_md.table['process']['stream'][0] looks like this:
            #0 program SegGener
            #0 version 6831
            #0 cvs_repository https://redoubt.ligo-wa.caltech.edu/
            #0 svn/gds/trunk/Monitors/SegGener/SegGener.cc
            #0 cvs_entry_time 1055611021
            #0 comment Segment generation from an OSC condition
            #0 node l1gds2
            #0 username [email protected]
            #0 unix_procid 24286
            #0 start_time 1065916603
            #0 end_time 1070395521
            #0 process_id process:process_id:0
            #0 ifos L0L1
            # So now I have all of that info stored by the process_id keys
            # Eventually I have to map these elements to the process_metadata
            # style.. maybe I can do that now:
            process_dict[temp_process_id]['process_metadata'] = {}
            if hackDec11:
                process_dict[temp_process_id]['process_metadata'][
                    'process_start_time'] = process_dict[temp_process_id][
                        'start_time']
            else:
                # This is for the newer server APIs: (April 24 2015 we checked it (it probably changed before ER6 finally))
                process_dict[temp_process_id]['process_metadata'][
                    'process_start_timestamp'] = process_dict[temp_process_id][
                        'start_time']
            if new_comments:
                process_dict[temp_process_id][
                    'process_comment'] = process_dict[temp_process_id][
                        'comment']
            process_dict[temp_process_id]['process_metadata'][
                'uid'] = process_dict[temp_process_id]['username']
            process_dict[temp_process_id]['process_metadata']['args'] = [
            ]  ### Fix!!! dqxml has no args???
            process_dict[temp_process_id]['process_metadata'][
                'pid'] = process_dict[temp_process_id]['unix_procid']
            process_dict[temp_process_id]['process_metadata'][
                'name'] = process_dict[temp_process_id]['program']
            process_dict[temp_process_id]['process_metadata'][
                'fqdn'] = process_dict[temp_process_id][
                    'node']  ### Fix!!! Improvement: not really fqdn, just the node name
        # So now I have process_dict[temp_process_id]['process_metadata'] for each
        # process_id, and can add it to a flag version when it uses it; really I
        # should group it with the segment summary info because that has the
        # insertion_metadata start and stop time
        ### Fix!!! Get the args from the *other* process table... yikes
        ### Double check what is done below works!
        # First pass:
        #if debug:
        #    import pdb
        #    pdb.set_trace()
        temp_process_params_process_id = None
        try:
            len(segment_md.table['process_params']['stream'])
        except:
            logger.info("No process_params table for file: %s" % filename)
        else:
            for j in range(len(segment_md.table['process_params']['stream'])):
                process_id_index = segment_md.table['process_params'][
                    'orderedcol'].index('process_id')
                temp_process_params_process_id = segment_md.table[
                    'process_params']['stream'][j][process_id_index]
                # This next bit looks a bit strange, but the goal is to pull off only the param and value from each row of the process_params table, and then put them into the process_metadata
                # Thus we loop through the columns in each row and toss out everything but the param and value entries, and then outside the for loop, append them to the args list
                for i, entry in enumerate(
                        segment_md.table['process_params']['orderedcol']):
                    if entry == "param":
                        temp_param = str(
                            segment_md.table['process_params']['stream'][j][i])
                    if entry == "value":
                        temp_value = str(
                            segment_md.table['process_params']['stream'][j][i])
                process_dict[temp_process_params_process_id][
                    'process_metadata']['args'].append(str(temp_param))
                process_dict[temp_process_params_process_id][
                    'process_metadata']['args'].append(str(temp_value))
        #if debug:
        #    import pdb
        #    pdb.set_trace()
        # Map each (ifo, name, version) to an InsertFlagVersion object and
        # record this file's temporary ids so rows can be matched back to it.
        temp_id_to_flag_version = {}
        for i in flag_versions_numbered.keys():
            ifo = flag_versions_numbered[i]['ifos']
            name = flag_versions_numbered[i]['name']
            version = flag_versions_numbered[i]['version']
            if (ifo, name, version) not in flag_versions.keys():
                if new_comments == True:
                    flag_versions[(ifo, name, version)] = InsertFlagVersion(
                        ifo, name, version)
                else:
                    flag_versions[(ifo, name, version)] = InsertFlagVersionOld(
                        ifo, name, version)
                if new_comments:
                    flag_versions[(ifo, name, version)].flag_description = str(
                        flag_versions_numbered[i]['comment']
                    )  # old segment_definer comment = new flag_description
                    # OUTDATED PLACEHOLDER: flag_versions[(ifo,name,version)].version_comment=str(flag_versions_numbered[i]['comment'])
                else:
                    flag_versions[(ifo, name, version)].flag_comment = str(
                        flag_versions_numbered[i]['comment'])
                    flag_versions[(ifo, name, version)].version_comment = str(
                        flag_versions_numbered[i]['comment'])
            # Temporary ids are only valid within the current file, so they
            # are refreshed on every pass through the filename loop.
            flag_versions[(
                ifo, name,
                version)].temporary_definer_id = flag_versions_numbered[i][
                    'segment_def_id']
            flag_versions[(
                ifo, name, version
            )].temporary_process_id = flag_versions_numbered[i]['process_id']
            # Populate reverse lookup dictionary:
            temp_id_to_flag_version[flag_versions[(
                ifo, name, version)].temporary_definer_id] = (ifo, name,
                                                              version)
        # ways to solve the metadata problem:
        # Associate each insertion_metadata block with a process, then group
        # them and take the min insert_data_start and max insert_data_stop
        # parse segment_summary table and associate known segments with
        # flag_versions above:
        ## Note this next line is needed for looping over multiple files
        for i in flag_versions.keys():
            flag_versions[i].temp_process_ids = {}
        for j in range(len(segment_md.table['segment_summary']['stream'])):
            #flag_versions_numbered[j] = {}
            seg_def_index = segment_md.table['segment_summary'][
                'orderedcol'].index('segment_def_id')
            #print "associated seg_def_id is: "+ segment_md.table['segment_summary']['stream'][j][seg_def_index]
            (ifo, name, version
             ) = temp_id_to_flag_version[segment_md.table['segment_summary']
                                         ['stream'][j][seg_def_index]]
            seg_sum_index = segment_md.table['segment_summary'][
                'orderedcol'].index('segment_sum_id')
            # Unneeded:
            #flag_versions[(ifo,name,version)].temporary_segment_sum_id = segment_md.table['segment_summary']['stream'][j][seg_sum_index]
            start_time_index = segment_md.table['segment_summary'][
                'orderedcol'].index('start_time')
            end_time_index = segment_md.table['segment_summary'][
                'orderedcol'].index('end_time')
            start_time = segment_md.table['segment_summary']['stream'][j][
                start_time_index] + offset
            end_time = segment_md.table['segment_summary']['stream'][j][
                end_time_index] + offset
            comment_index = segment_md.table['segment_summary'][
                'orderedcol'].index('comment')
            seg_sum_comment = segment_md.table['segment_summary']['stream'][j][
                comment_index]
            new_seg_summary = segments.segmentlist(
                [segments.segment(start_time, end_time)])
            flag_versions[(ifo, name, version)].appendKnown(new_seg_summary)
            # Now I need to build up the insertion_metadata dictionary for this
            # summary:
            # Now I need to associate the right process with the known
            # segments here, and put the start and end time into the
            # insertion_metadata part of the
            # insert_history dict
            # Plan for processes and affected data:
            # Loop through segment summaries
            # If we haven't seen the associated process before, create it:
            # First, append the temp_process_id to temp_process_ids
            # Then, each temp_process_ids entry is a dictionary, where the one
            # element is start_affected time, and the other is end_affected
            # time, and later we will combine this with the correct
            # process_metadata dictionary
            process_id_index = segment_md.table['segment_summary'][
                'orderedcol'].index('process_id')
            temp_process_id = segment_md.table['segment_summary']['stream'][j][
                process_id_index]
            if temp_process_id in flag_versions[(
                    ifo, name, version)].temp_process_ids.keys():
                # We don't need to append this process metadata, as it already
                # exists We do need to extend the affected data start and stop
                # to match
                if start_time < flag_versions[(
                        ifo, name, version
                )].temp_process_ids[temp_process_id]['insert_data_start']:
                    flag_versions[(ifo, name, version)].temp_process_ids[
                        temp_process_id]['insert_data_start'] = start_time
                if end_time > flag_versions[(
                        ifo, name, version
                )].temp_process_ids[temp_process_id]['insert_data_stop']:
                    flag_versions[(ifo, name, version)].temp_process_ids[
                        temp_process_id]['insert_data_stop'] = end_time
            else:
                # Need to make the dictionary entry for this process_id
                if seg_sum_comment != None:
                    flag_versions[(ifo, name,
                                   version)].provenance_url = seg_sum_comment
                else:
                    flag_versions[(ifo, name, version)].provenance_url = ''
                flag_versions[(
                    ifo, name, version)].temp_process_ids[temp_process_id] = {}
                flag_versions[(ifo, name, version)].temp_process_ids[
                    temp_process_id]['insert_data_start'] = start_time
                flag_versions[(ifo, name, version)].temp_process_ids[
                    temp_process_id]['insert_data_stop'] = end_time
        # Now, I need to append an insert_history element to the flag_versions
        # for this ifo,name, version, as I have the correct insertion_metadata
        # and the correct
        # process_metadata (from the process_dict earlier
        if debug:
            t1 = time.time()
        for i in flag_versions.keys():
            for pid in flag_versions[i].temp_process_ids.keys():
                start = flag_versions[i].temp_process_ids[pid][
                    'insert_data_start']
                stop = flag_versions[i].temp_process_ids[pid][
                    'insert_data_stop']
                if new_comments:
                    flag_versions[i].flag_version_comment = process_dict[pid][
                        'process_comment']
                insert_history_dict = {}
                try:
                    insert_history_dict['process_metadata'] = process_dict[
                        pid]['process_metadata']
                except:
                    raise
                #    import pdb
                #    pdb.set_trace()
                insert_history_dict['insertion_metadata'] = {}
                insert_history_dict['insertion_metadata'][
                    'insert_data_stop'] = stop
                insert_history_dict['insertion_metadata'][
                    'insert_data_start'] = start
                ifo = flag_versions[i].ifo
                version = flag_versions[i].version
                name = flag_versions[i].name
                if use_new_insertion_metadata == True:
                    insert_history_dict['insertion_metadata'][
                        'comment'] = '/dq/' + '/'.join(
                            [str(ifo), str(name), str(version)]
                        )  # FIX make dq a constant string in case we ever change it
                else:
                    insert_history_dict['insertion_metadata'][
                        'uri'] = '/dq/' + '/'.join(
                            [str(ifo), str(name), str(version)]
                        )  # FIX make dq a constant string in case we ever change it
                #print ifo,name,version
                insert_history_dict['insertion_metadata'][
                    'timestamp'] = _UTCToGPS(time.gmtime())
                insert_history_dict['insertion_metadata'][
                    'auth_user'] = process.get_username()
                #if hackDec11:
                #    # note that this only uses one insert_history...despite
                #    all that hard work to get the list right...
                #    # so this might break something...
                #    flag_versions[i].insert_history=insert_history_dict
                #else:
                #    flag_versions[i].insert_history.append(insert_history_dict)
                flag_versions[i].insert_history.append(insert_history_dict)
        # parse segment table and associate known segments with flag_versions
        # above:
        try:
            for j in range(len(segment_md.table['segment']['stream'])):
                #flag_versions_numbered[j] = {}
                seg_def_index = segment_md.table['segment'][
                    'orderedcol'].index('segment_def_id')
                #print "associated seg_def_id is: "+
                # segment_md.table['segment']['stream'][j][seg_def_index]
                (ifo, name, version) = temp_id_to_flag_version[
                    segment_md.table['segment']['stream'][j][seg_def_index]]
                #seg_sum_index = segment_md.table['segment']['orderedcol'].index('segment_sum_id')
                start_time_index = segment_md.table['segment'][
                    'orderedcol'].index('start_time')
                end_time_index = segment_md.table['segment'][
                    'orderedcol'].index('end_time')
                start_time = segment_md.table['segment']['stream'][j][
                    start_time_index] + offset
                end_time = segment_md.table['segment']['stream'][j][
                    end_time_index] + offset
                new_seg = segments.segmentlist(
                    [segments.segment(start_time, end_time)])
                flag_versions[(ifo, name, version)].appendActive(new_seg)
        except KeyError:
            logger.info("No segment table for this file: %s" % filename)
            if debug:
                print "No segment table for this file: %s" % filename
        except:
            print "Unexpected error:", sys.exc_info()[0]
            raise
    for i in flag_versions.keys():
        flag_versions[i].coalesceInsertHistory()
    if threads > 1:
        # Call this after the loop over files, and we should be good to go
        # NOTE(review): `i` here is the last key from the loop above (a
        # 3-tuple), so len(i) is always 3 -- likely meant len(flag_versions);
        # the original author flagged it too.
        concurrent = min(threads, len(i))  # Fix!!! why did I do len(i) ???
        q = Queue(concurrent *
                  2)  # Fix!!! Improvement: remove hardcoded concurrency
        for i in range(concurrent):
            t = Thread(target=threadedPatchWithFailCases,
                       args=[q, server, debug, logger])
            t.daemon = True
            t.start()
        for i in flag_versions.values():
            i.buildFlagDictFromInsertVersion()
            #i.flagDict
            url = i.buildURL(server)
            if debug:
                print url
                logger.debug("json.dumps(i.flagDict):")
                logger.debug("%s" % json.dumps(i.flagDict))
            #if hackDec11:
            #    if len(i.active)==0:
            #        print "No segments for this url"
            #        continue
            q.put(i)
        q.join()
    else:
        for i in flag_versions.values():
            i.buildFlagDictFromInsertVersion()
            #i.flagDict
            url = i.buildURL(server)
            if debug:
                logger.debug("Url for the following data: %s" % url)
                #print url
                logger.debug("json.dumps(i.flagDict):")
                logger.debug("%s" % json.dumps(i.flagDict))
            #if hackDec11:
            #    if len(i.active)==0:
            #        print "No segments for this url"
            #        continue
            patchWithFailCases(i, url, debug, logger, testing_options)
    if debug:
        logger.debug(
            "If we made it this far, no errors were encountered in the inserts."
        )
        #print "If we made it this far, no errors were encountered in the inserts."
    ### Fix!!! Improvement: Should be more careful about error handling here.
    if debug:
        t2 = time.time()
        logger.debug("Time elapsed for file %s = %d." % (filename, t2 - t1))
        #print "Time elapsed for file %s = %d." % (filename,t2-t1)
    return True
RequirePyRXP = 0 # set this to 1 to disable the nonvalidating fallback parser. import string try: #raise ImportError, "dummy error" simpleparse = 0 import pyRXPU def warnCB(s): print s pyRXP_parser = pyRXPU.Parser( ErrorOnValidityErrors=1, NoNoDTDWarning=1, ExpandCharacterEntities=1, ExpandGeneralEntities=1, warnCB=warnCB, srcName='string input', ReturnUTF8=1, ) def parsexml(xmlText, oneOutermostTag=0, eoCB=None, entityReplacer=None, parseOpts={}): pyRXP_parser.eoCB = eoCB p = pyRXP_parser.parse(xmlText, **parseOpts) return oneOutermostTag and p or ('', None, [p], None) except ImportError: simpleparse = 1
def getPyRXPParser():
    """Return a default pyRXPU parser instance."""
    import pyRXPU
    return pyRXPU.Parser()