コード例 #1
0
 def test_compound(self):
     """Exercise attribute handling and rendering of CompoundElement."""
     element = CompoundElement(["hello", "world"], id="42", foo="bar")
     # An attribute that was given to the constructor may be reassigned ...
     element.foo = "baz"
     # ... whereas assigning one that was not is expected to fail.
     with self.assertRaises(AttributeError):
         element.y = "z"
     # Append a non-serializable object (in this case a function).
     element.append(os.listdir)

     expected_xhtml = b'<compoundelement xmlns="http://www.w3.org/1999/xhtml" id="42">helloworld&lt;built-in function listdir&gt;</compoundelement>'
     actual_xhtml = etree.tostring(element.as_xhtml())
     self.assertEqual(expected_xhtml, actual_xhtml)

     # Plaintext rendering of a small nested document.
     paragraphs = [Paragraph(["Hello"]), Paragraph(["World"])]
     body = Body([Section(paragraphs)])
     self.assertEqual(body.as_plaintext(), "Hello World")
コード例 #2
0
    def test_serialize_roundtrip(self):
        # Create a elements object tree
        tree = Body([
            Section([Paragraph(["Hello"]),
                     Paragraph(["World"])],
                    ordinal="1",
                    title="Main section"),
            Section([
                42,
                date(2013, 11, 27),
                datetime(2013, 11, 27, 12, 0, 0), b'bytestring', {
                    'foo': 'bar',
                    'x': 'y'
                }
            ],
                    ordinal=2,
                    title="Native types")
        ])
        # roundtrip using the default XML format
        serialized = serialize(tree)
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, caller_globals=globals())
        self.assertEqual(tree, newtree)

        # make another section with special (but commonly used) types
        # and try to roundtrip them. The XML serialization format does
        # not support this.
        graph = Graph().parse(
            data="""@prefix dcterms: <http://purl.org/dc/terms/> .

<http://example.org/1> dcterms:title "Hello world"@en .
""",
            format="turtle")
        parseresult = urlparser.parseString("http://example.org/1")
        tree.append(Section([parseresult, graph], meta=graph))

        # roundtrip using JSON (which uses fully qualified classnames,
        # so we don't need to pass globals() into deserialize()
        serialized = serialize(tree, format="json")
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, format="json")

        # two pyparsing.ParseResult objects cannot be directly
        # compared (they don't implement __eq__), therefore we compare
        # their XML representations
        tree[2][0] = util.parseresults_as_xml(tree[2][0])
        newtree[2][0] = util.parseresults_as_xml(newtree[2][0])
        self.assertEqual(tree, newtree)
コード例 #3
0
    def test_parse_existing(self):
        """Check that parse_recursive leaves this document's structure as is.

        The serialized form of the document must be the same before and
        after parsing, both for a blank CitationParser and for one set up
        with the url pattern and formatter.
        """
        class MyHeader(UnicodeElement):
            pass

        doc = Body([
            MyHeader("My document"),
            Paragraph([
                "It's a very very fine document.",
                MyHeader("Subheading"),
                "And now we're done.",
            ]),
        ])
        want = serialize(doc)

        def assert_untouched(parser):
            # Parse a private copy so that ``doc`` stays pristine for the
            # next parser, then compare serialized forms.
            candidate = deepcopy(doc)
            parser.parse_recursive(candidate)
            self.assertEqual(want, serialize(candidate))

        # First test a blank CitationParser, w/o patterns or formatter.
        assert_untouched(CitationParser())

        # Then one that knows about URLs.
        url_parser = CitationParser(ferenda.citationpatterns.url)
        url_parser.set_formatter(URIFormatter(("url", ferenda.uriformats.url)))
        assert_untouched(url_parser)
コード例 #4
0
    def test_parse_recursive(self):
        """Check that parse_recursive links citations at every nesting level.

        Both the custom "Doc N/YYYY" references and the plain URL inside
        the footnote are expected to come back wrapped in LinkSubject
        elements.
        """
        # Citation pattern for references such as "Doc 43/2012".
        ordinal = Word(nums).setResultsName("ordinal")
        year = Word(nums, exact=4).setResultsName("year")
        doc_citation = ("Doc" + ordinal + "/" + year).setResultsName("DocRef")

        def doc_uri_formatter(parts):
            return "http://example.org/docs/%(year)s/%(ordinal)s/" % parts

        def doc_link():
            # A fresh link element for every place the document is cited.
            return LinkSubject("Doc 43/2012",
                               predicate="dcterms:references",
                               uri="http://example.org/docs/2012/43/")

        doc = Body([
            Heading(["About Doc 43/2012 and it's interpretation"]),
            Paragraph([
                "According to Doc 43/2012",
                Footnote(["Available at http://example.org/xyz"]),
                " the bizbaz should be frobnicated",
            ]),
        ])

        expected = Body([
            Heading(["About ", doc_link(), " and it's interpretation"]),
            Paragraph([
                "According to ",
                doc_link(),
                Footnote([
                    "Available at ",
                    LinkSubject("http://example.org/xyz",
                                predicate="dcterms:references",
                                uri="http://example.org/xyz"),
                ]),
                " the bizbaz should be frobnicated",
            ]),
        ])

        formatter = URIFormatter(("url", ferenda.uriformats.url),
                                 ("DocRef", doc_uri_formatter))
        cp = CitationParser(ferenda.citationpatterns.url, doc_citation)
        cp.set_formatter(formatter)
        parsed = cp.parse_recursive(doc)
        # Allow long diffs so that a failure shows the serialized trees.
        self.maxDiff = 4096
        self.assertEqual(serialize(parsed), serialize(expected))
コード例 #5
0
ファイル: caselaw.py プロジェクト: zigit/ferenda
    def parse_document_from_soup(self, soup, doc):
        """Fill ``doc.body`` with paragraphs built from the text in ``soup``.

        Looks up ``soup.find("div", "texte")`` and, for every child node
        that has a ``string``, runs it through a ``LegalRef`` parser and
        appends the result to ``doc.body`` as a ``Paragraph``. If that
        lookup yields nothing, a warning is logged and a placeholder
        paragraph is appended instead.

        :param soup: the parsed source document (presumably a
                     BeautifulSoup tree -- confirm against the caller)
        :param doc: the document whose ``body`` is appended to
        """
        # Process text and create DOM
        self.parser = LegalRef(LegalRef.EGRATTSFALL)

        textdiv = soup.find("div", "texte")
        if not textdiv:
            # The warning used to be formatted with ``celexnum``, a name
            # that is neither a parameter nor a local of this method, so
            # this branch raised NameError instead of logging.
            # NOTE(review): assumes ``doc`` carries its identifier as
            # ``basefile`` (as ferenda Document objects do) -- confirm.
            identifier = getattr(doc, "basefile", None)
            self.log.warning("%s: No fulltext available!" % identifier)
            doc.body.append(Paragraph(["(No fulltext available)"]))
            return

        for node in textdiv.childGenerator():
            if not node.string:
                continue
            # Here we should start analyzing for things like
            # "C-197/09". Note that the Eurlex data does not use
            # the ordinary hyphen like above, but rather
            # 'NON-BREAKING HYPHEN' (U+2011) - LegalRef will mangle
            # this to an ordinary hyphen.
            subnodes = self.parser.parse(
                node.string, predicate="dcterms:references")
            doc.body.append(Paragraph(subnodes))
コード例 #6
0
ファイル: jo.py プロジェクト: mavteam/ferenda
 def make_paragraph(parser):
     """Wrap the next chunk from ``parser.reader`` in a Paragraph."""
     # Design note carried over from the original author: a Paragraph
     # containing PDFReader.Textelement objects renders them as <span>
     # elements (the default rendering). A PDFReader.Textbox containing
     # the same objects renders unstyled Textelements as plain strings,
     # cutting down on unnecessary <span> elements, but the Textbox
     # itself then renders with unnecessary @style and @class
     # attributes, which we don't want. For now, stick with Paragraphs
     # as containers and maybe later figure out how to get
     # PDFReader.Textelements to render themselves sanely.
     return Paragraph(parser.reader.next())
コード例 #7
0
 def make_paragraph(parser):
     """Return a Paragraph whose only child is the reader's next chunk."""
     chunk = parser.reader.next()
     return Paragraph([chunk])
コード例 #8
0
ファイル: jo.py プロジェクト: mavteam/ferenda
 def make_abstract(parser):
     """Build an Abstract from the next chunk and hand it to the parser.

     The next chunk from ``parser.reader`` becomes a Paragraph, which is
     the sole initial child of a new Abstract. The return value is
     whatever ``parser.make_children`` returns for that Abstract.
     """
     opening = Paragraph(parser.reader.next())
     abstract = Abstract([opening])
     return parser.make_children(abstract)
コード例 #9
0
ファイル: rfc.py プロジェクト: zigit/ferenda
 def make_paragraph(parser):
     """Return a Paragraph holding the next chunk with whitespace collapsed.

     The chunk is split on whitespace and re-joined with single spaces,
     so line breaks and runs of blanks inside it become one space each
     (assumes the reader yields strings).

     :param parser: object whose ``reader.next()`` supplies the chunk
     """
     # Read through the ``parser`` argument instead of the free name
     # ``p``: the function then no longer depends on a variable from an
     # enclosing scope and matches the other constructor functions.
     chunk = parser.reader.next()
     return Paragraph([" ".join(chunk.split())])
コード例 #10
0
ファイル: elementclasses.py プロジェクト: zigit/ferenda
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

# begin makedoc
from ferenda.elements import Body, Heading, Paragraph, Footnote

# A small document: a titled heading followed by one paragraph that
# contains a footnote between two runs of text.
doc = Body([
    Heading(["About Doc 43/2012 and it's interpretation"],
            predicate="dcterms:title"),
    Paragraph([
        "According to Doc 43/2012",
        Footnote(["Available at http://example.org/xyz"]),
        " the bizbaz should be frobnicated",
    ]),
])
# end makedoc

# begin derived-class
from ferenda.elements import CompoundElement, OrdinalElement

class Preamble(CompoundElement):
    # Plain container element; adds nothing to CompoundElement.
    pass
class PreambleRecital(CompoundElement, OrdinalElement):
    # Class-level overrides; presumably consumed by the base classes when
    # rendering -- verify against ferenda.elements.
    tagname = "div"
    rdftype = "eurlex:PreambleRecital"

# NOTE(review): every recital is passed as its own one-item list, i.e.
# Preamble receives three positional arguments -- confirm that this is
# the intended way to call the constructor.
doc = Preamble(
    [PreambleRecital("Un", ordinal=1)],
    [PreambleRecital("Deux", ordinal=2)],
    [PreambleRecital("Trois", ordinal=3)],
)
# end derived-class

# begin as-xhtml
from ferenda.elements import SectionalElement
p = SectionalElement(["Some content"],
                     ordinal = "1a",
                     identifier = "Doc pt 1(a)",