def test_compound(self):
    """Check attribute handling and rendering of ``CompoundElement``.

    The test expects that an attribute supplied to the constructor
    (``foo``) can be reassigned, while assigning an attribute that was
    never supplied (``y``) raises ``AttributeError``. It then checks
    the XHTML and plaintext renderings.
    """
    element = CompoundElement(["hello", "world"], id="42", foo="bar")
    element.foo = "baz"
    with self.assertRaises(AttributeError):
        element.y = "z"

    # a non-serializable object (in this case a function)
    element.append(os.listdir)

    # The function's repr contains "<" and ">". Inside XML character
    # data these must be serialized as entities, so the expected bytes
    # use "&lt;" / "&gt;" -- a raw "<built-in function listdir>" would
    # not be well-formed XML and can never come out of etree.tostring().
    self.assertEqual(
        b'<compoundelement xmlns="http://www.w3.org/1999/xhtml" id="42">'
        b'helloworld&lt;built-in function listdir&gt;</compoundelement>',
        etree.tostring(element.as_xhtml()))

    self.assertEqual(
        Body([Section([Paragraph(["Hello"]),
                       Paragraph(["World"])])]).as_plaintext(),
        "Hello World")
def test_serialize_roundtrip(self):
    """Round-trip an element tree through the XML and JSON formats."""
    # Build a tree that mixes nested elements with native Python values.
    text_section = Section(
        [Paragraph(["Hello"]), Paragraph(["World"])],
        ordinal="1",
        title="Main section")
    native_section = Section(
        [
            42,
            date(2013, 11, 27),
            datetime(2013, 11, 27, 12, 0, 0),
            b'bytestring',
            {'foo': 'bar', 'x': 'y'},
        ],
        ordinal=2,
        title="Native types")
    tree = Body([text_section, native_section])

    # First pass: the default XML format, which needs globals() passed
    # to deserialize().
    as_xml = serialize(tree)
    self.assertIsInstance(as_xml, str)
    restored = deserialize(as_xml, caller_globals=globals())
    self.assertEqual(tree, restored)

    # Add a section holding special (but commonly used) types. The XML
    # serialization format does not support these, so they are only
    # round-tripped through JSON below.
    graph = Graph().parse(
        data="""@prefix dcterms: <http://purl.org/dc/terms/> . <http://example.org/1> dcterms:title "Hello world"@en . """,
        format="turtle")
    parseresult = urlparser.parseString("http://example.org/1")
    tree.append(Section([parseresult, graph], meta=graph))

    # Second pass: JSON uses fully qualified class names, so there is
    # no need to pass globals() into deserialize().
    as_json = serialize(tree, format="json")
    self.assertIsInstance(as_json, str)
    restored = deserialize(as_json, format="json")

    # Two pyparsing.ParseResult objects cannot be compared directly
    # (they don't implement __eq__), so both are replaced by their XML
    # representation before the trees are compared.
    tree[2][0] = util.parseresults_as_xml(tree[2][0])
    restored[2][0] = util.parseresults_as_xml(restored[2][0])
    self.assertEqual(tree, restored)
def test_parse_existing(self):
    """Make sure parse_recursive leaves an existing structure untouched."""

    class MyHeader(UnicodeElement):
        pass

    def parsed_copy(citation_parser):
        # Run the parser over a private copy of the document and return
        # the serialized result, so the pristine original can be reused.
        clone = deepcopy(doc)
        citation_parser.parse_recursive(clone)
        return serialize(clone)

    doc = Body([
        MyHeader("My document"),
        Paragraph([
            "It's a very very fine document.",
            MyHeader("Subheading"),
            "And now we're done.",
        ]),
    ])
    expected = serialize(doc)

    # A blank CitationParser, without patterns or formatter.
    blank_parser = CitationParser()
    self.assertEqual(expected, parsed_copy(blank_parser))

    # A CitationParser set up with the URL pattern and URL formatter.
    url_parser = CitationParser(ferenda.citationpatterns.url)
    url_parser.set_formatter(URIFormatter(("url", ferenda.uriformats.url)))
    self.assertEqual(expected, parsed_copy(url_parser))
def test_parse_recursive(self):
    """Check that citations in nested elements are turned into links."""
    # Grammar for references of the form "Doc <ordinal>/<four-digit year>".
    ordinal_part = Word(nums).setResultsName("ordinal")
    year_part = Word(nums, exact=4).setResultsName("year")
    doc_citation = ("Doc" + ordinal_part + "/" + year_part).setResultsName("DocRef")

    def format_doc_uri(parts):
        return "http://example.org/docs/%(year)s/%(ordinal)s/" % parts

    def doc_link():
        # The link expected wherever "Doc 43/2012" occurs in the text.
        return LinkSubject("Doc 43/2012",
                           predicate="dcterms:references",
                           uri="http://example.org/docs/2012/43/")

    doc = Body([
        Heading(["About Doc 43/2012 and it's interpretation"]),
        Paragraph([
            "According to Doc 43/2012",
            Footnote(["Available at http://example.org/xyz"]),
            " the bizbaz should be frobnicated",
        ]),
    ])

    expected_heading = Heading([
        "About ",
        doc_link(),
        " and it's interpretation",
    ])
    expected_footnote_link = LinkSubject("http://example.org/xyz",
                                         predicate="dcterms:references",
                                         uri="http://example.org/xyz")
    expected_paragraph = Paragraph([
        "According to ",
        doc_link(),
        Footnote(["Available at ", expected_footnote_link]),
        " the bizbaz should be frobnicated",
    ])
    result = Body([expected_heading, expected_paragraph])

    cp = CitationParser(ferenda.citationpatterns.url, doc_citation)
    formatter = URIFormatter(("url", ferenda.uriformats.url),
                             ("DocRef", format_doc_uri))
    cp.set_formatter(formatter)
    doc = cp.parse_recursive(doc)

    # Raise the diff limit so a failure shows the differing serializations.
    self.maxDiff = 4096
    self.assertEqual(serialize(doc), serialize(result))
def parse_document_from_soup(self, soup, doc):
    """Fill ``doc.body`` with paragraphs built from the parsed page text.

    Looks up the ``<div class="texte">`` element in ``soup``. Each child
    node that has a ``string`` is run through a ``LegalRef`` parser and
    appended to ``doc.body`` as a ``Paragraph``. If the div is missing,
    a warning is logged and a placeholder paragraph is appended instead.

    :param soup: parsed document; must support ``find(name, cssclass)``
    :param doc: document object whose ``body`` supports ``append``
    """
    # Process text and create DOM
    self.parser = LegalRef(LegalRef.EGRATTSFALL)
    textdiv = soup.find("div", "texte")

    if not textdiv:
        # The original code formatted an undefined name (``celexnum``)
        # into this message, raising NameError instead of logging.
        # NOTE(review): doc.basefile is presumably the document
        # identifier -- confirm against the Document class.
        docid = getattr(doc, "basefile", None)
        self.log.warning("%s: No fulltext available!" % docid)
        doc.body.append(Paragraph(["(No fulltext available)"]))
        return

    for node in textdiv.childGenerator():
        if not node.string:
            continue
        # Here we should start analyzing for things like "C-197/09".
        # Note that the Eurlex data does not use the ordinary hyphen
        # like above, but rather 'NON-BREAKING HYPHEN' (U+2011) --
        # LegalRef will mangle this to an ordinary hyphen.
        subnodes = self.parser.parse(node.string,
                                     predicate="dcterms:references")
        doc.body.append(Paragraph(subnodes))
def make_paragraph(parser):
    """Wrap the next chunk from the parser's reader in a Paragraph."""
    # Design note carried over from the original author: a Paragraph
    # holding PDFReader.Textelement objects renders each of them as a
    # <span> (the default rendering). A PDFReader.Textbox holding the
    # same objects renders unstyled Textelements as plain strings,
    # which cuts down on unnecessary <span> elements, but the Textbox
    # itself then renders with unwanted @style and @class attributes.
    # For now Paragraph stays the container; getting Textelements to
    # render themselves sanely is left for later.
    return Paragraph(parser.reader.next())
def make_paragraph(parser):
    """Return a Paragraph whose only child is the reader's next chunk."""
    chunk = parser.reader.next()
    return Paragraph([chunk])
def make_abstract(parser):
    """Start an Abstract from the next chunk and let the parser fill it.

    The next chunk from the reader becomes the first Paragraph of the
    Abstract; the result of ``parser.make_children`` is returned.
    """
    first_paragraph = Paragraph(parser.reader.next())
    abstract = Abstract([first_paragraph])
    return parser.make_children(abstract)
def make_paragraph(parser):
    """Return a Paragraph holding the next chunk, whitespace-normalized.

    Every run of whitespace in the chunk (including newlines) is
    collapsed to a single space, and leading/trailing whitespace is
    removed.

    :param parser: object whose ``reader.next()`` returns the next chunk
                   of text
    """
    # Read through the ``parser`` argument. The original body used the
    # name ``p``, which is not defined in this function and only works
    # if an outer scope happens to bind it.
    chunk = parser.reader.next()
    return Paragraph([" ".join(chunk.split())])
# -*- coding: utf-8 -*- from __future__ import unicode_literals # begin makedoc from ferenda.elements import Body, Heading, Paragraph, Footnote doc = Body([Heading(["About Doc 43/2012 and it's interpretation"],predicate="dcterms:title"), Paragraph(["According to Doc 43/2012", Footnote(["Available at http://example.org/xyz"]), " the bizbaz should be frobnicated"]) ]) # end makedoc # begin derived-class from ferenda.elements import CompoundElement, OrdinalElement class Preamble(CompoundElement): pass class PreambleRecital(CompoundElement,OrdinalElement): tagname = "div" rdftype = "eurlex:PreambleRecital" doc = Preamble([PreambleRecital("Un",ordinal=1)], [PreambleRecital("Deux",ordinal=2)], [PreambleRecital("Trois",ordinal=3)]) # end derived-class # begin as-xhtml from ferenda.elements import SectionalElement p = SectionalElement(["Some content"], ordinal = "1a", identifier = "Doc pt 1(a)",