Example #1
0
    def test_parse_existing(self):
        # parse_recursive must hand back an unchanged tree when there is
        # nothing to cite, even if the tree holds element classes that are
        # defined locally.
        class MyHeader(UnicodeElement):
            pass

        title = MyHeader("My document")
        paragraph = Paragraph(["It's a very very fine document.",
                               MyHeader("Subheading"),
                               "And now we're done."])
        doc = Body([title, paragraph])
        want = serialize(doc)

        def serialized_after_parse(parser):
            # work on a private copy so that `doc` itself stays pristine
            scratch = deepcopy(doc)
            parser.parse_recursive(scratch)
            return serialize(scratch)

        # first a blank CitationParser, w/o patterns or formatter
        self.assertEqual(want, serialized_after_parse(CitationParser()))

        # then a parser with the url pattern and a matching formatter
        url_parser = CitationParser(ferenda.citationpatterns.url)
        url_parser.set_formatter(
            URIFormatter(("url", ferenda.uriformats.url)))
        self.assertEqual(want, serialized_after_parse(url_parser))
Example #2
0
    def parametric_test(self, filename):
        """Build a document from ``filename`` and compare it to the stored result.

        The text file is read as iso-8859-1 with DOS line endings. The
        serialized structure is compared with the contents of the
        ``.xml`` file of the same name; if that file does not exist the
        serialization is expected to be empty.
        """
        repo = SFS()
        repo.id = '(test)'
        repo.reader = TextReader(filename=filename,
                                 encoding='iso-8859-1',
                                 linesep=TextReader.DOS)
        repo.reader.autostrip = True
        body = repo.makeForfattning()

        counts = repo._count_elements(body)
        # NOTE(review): an earlier comment here said that ['A', 'K']
        # "breaks test cases", yet that is the value in use -- confirm.
        skip_k = 'K' in counts and counts['K'] > 1 and counts['P1'] < 2
        skipfragments = ['A', 'K'] if skip_k else ['A']
        repo._construct_ids(body, '',
                            'http://rinfo.lagrummet.se/publ/sfs/9999:999',
                            skipfragments)
        self._remove_uri_for_testcases(body)

        resultfilename = filename.replace(".txt", ".xml")
        self.maxDiff = 4096
        if not os.path.exists(resultfilename):
            # no stored result: the only acceptable output is an empty one
            self.assertEqual("", serialize(body).strip())
            return
        with codecs.open(resultfilename, encoding="utf-8") as fp:
            expected = fp.read().strip()
        self.assertEqual(expected, serialize(body).strip())
Example #3
0
    def test_parse_existing(self):
        # Guard against parse_recursive altering a structure that holds
        # nothing for it to link.
        class MyHeader(UnicodeElement):
            pass

        pristine = Body([
            MyHeader("My document"),
            Paragraph([
                "It's a very very fine document.",
                MyHeader("Subheading"),
                "And now we're done.",
            ]),
        ])
        expected = serialize(pristine)

        # Round 1: a CitationParser without any patterns or formatter.
        blank_parser = CitationParser()
        working = deepcopy(pristine)
        blank_parser.parse_recursive(working)
        self.assertEqual(expected, serialize(working))

        # Round 2: a CitationParser set up with the url pattern/formatter.
        url_parser = CitationParser(ferenda.citationpatterns.url)
        url_formatter = URIFormatter(("url", ferenda.uriformats.url))
        url_parser.set_formatter(url_formatter)
        working = deepcopy(pristine)
        url_parser.parse_recursive(working)
        self.assertEqual(expected, serialize(working))
Example #4
0
    def test_add_different_types(self):
        box1 = Textbox([Textelement("hey", tag=None)],
                       fontid=None,
                       top=0,
                       left=0,
                       width=50,
                       height=10,
                       lines=1)
        box2 = Textbox([LinkedTextelement("1", tag="s", uri="foo.html")],
                       fontid=None,
                       top=0,
                       left=50,
                       width=5,
                       height=10,
                       lines=1)
        combinedbox = box1 + box2
        want = """
<Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="55" top="0" width="55">
  <Textelement>hey</Textelement>
  <LinkedTextelement tag="s" uri="foo.html">1</LinkedTextelement>
</Textbox>
"""
        self.assertEqual(want[1:], serialize(combinedbox))
        # make sure __iadd__ performs like __add__
        box1 += box2
        self.assertEqual(want[1:], serialize(box1))
Example #5
0
    def test_multiple_textelements(self):
        pdf = self._parse_xml("""
<fontspec id="1" size="5" family="X" color="#00000"/>
<text top="0" left="0" width="23" height="13" font="1"><b>foo</b> <b>bar</b></text>
""")
        # test that the space between the two <b> tags doesn't get lost
        self.assertEqual("foo bar", str(pdf[0][0]))
        self.assertEqual('<Textelement tag="b">foo bar</Textelement>',
                         serialize(pdf[0][0][0] + pdf[0][0][1]).strip())
        want = """
<Textbox bottom="13" fontid="1" height="13" left="0" lineheight="0" lines="0" right="23" top="0" width="23">
  <Textelement tag="b">foo </Textelement>
  <Textelement tag="b">bar</Textelement>
</Textbox>
"""
        self.assertEqual(want[1:], serialize(pdf[0][0]))

        # 2nd test, with leading non-tagged Textelement
        pdf = self._parse_xml("""
<fontspec id="0" size="5" family="X" color="#00000"/>
<text top="374" left="508" width="211" height="14" font="0">näringsidkaren <i>en</i> <i>varning. En var-</i></text>
""")
        want = """
<Textbox bottom="388" fontid="0" height="14" left="508" lineheight="0" lines="0" right="719" top="374" width="211">
  <Textelement>näringsidkaren </Textelement>
  <Textelement tag="i">en </Textelement>
  <Textelement tag="i">varning. En var-</Textelement>
</Textbox>
"""
        self.assertEqual(want[1:], serialize(pdf[0][0]))
Example #6
0
    def test_elements_from_soup(self):
        from ferenda.elements import html
        soup = BeautifulSoup("""<body>
<h1>Sample</h1>
<div class="main">
<img src="xyz.png"/>
<p>Some <b>text</b></p>
<dl>
<dt>Term 1</dt>
<dd>Definition 1</dd>
</dl>
</div>
<div id="foot">
<hr/>
<a href="/">home</a> - <a href="/about">about</a>
</div>
</body>""", "lxml")
        body = html.elements_from_soup(soup.body)
        # print("Body: \n%s" % serialize(body))
        result = html.Body([html.H1(["Sample"]),
                            html.Div([html.Img(src="xyz.png"),
                                      html.P(["Some ",
                                              html.B(["text"])]),
                                      html.DL([html.DT(["Term 1"]),
                                               html.DD(["Definition 1"])])
                                  ], **{"class": "main"}),
                            html.Div([html.HR(),
                                      html.A(["home"], href="/"),
                                      " - ",
                                      html.A(["about"], href="/about")
                                  ], id="foot")])
        self.maxDiff = 4096
        self.assertEqual(serialize(body), serialize(result))
Example #7
0
    def test_addboxes(self):
        box1 = Textbox([Textelement("hey ", tag=None)],
                       fontid=None,
                       top=0,
                       left=0,
                       width=50,
                       height=10,
                       lines=1)
        box2 = Textbox([Textelement("ho", tag=None)],
                       fontid=None,
                       top=0,
                       left=50,
                       width=40,
                       height=10,
                       lines=1)

        combinedbox = box1 + box2
        want = """
<Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="90" top="0" width="90">
  <Textelement>hey ho</Textelement>
</Textbox>
"""
        self.assertEqual(want[1:], serialize(combinedbox))
        # make sure __iadd__ performs like __add__
        box1 += box2
        self.assertEqual(want[1:], serialize(box1))
Example #8
0

    def test_multiple_textelements(self):
        pdf = self._parse_xml("""
<fontspec id="1" size="5" family="X" color="#00000"/>
<text top="0" left="0" width="23" height="13" font="1"><b>foo</b> <b>bar</b></text>
""")
        # test that the space between the two <b> tags doesn't get lost
        self.assertEqual("foo bar", str(pdf[0][0]))
        self.assertEqual('<Textelement tag="b">foo bar</Textelement>',
                         serialize(pdf[0][0][0] + pdf[0][0][1]).strip())
        want = """
<Textbox bottom="13" fontid="1" height="13" left="0" lineheight="0" lines="0" right="23" top="0" width="23">
  <Textelement tag="b">foo </Textelement>
  <Textelement tag="b">bar</Textelement>
</Textbox>
"""
        self.assertEqual(want[1:], serialize(pdf[0][0]))

        # 2nd test, with leading non-tagged Textelement
        pdf = self._parse_xml("""
<fontspec id="0" size="5" family="X" color="#00000"/>
<text top="374" left="508" width="211" height="14" font="0">näringsidkaren <i>en</i> <i>varning. En var-</i></text>
""")
        want = """
<Textbox bottom="388" fontid="0" height="14" left="508" lineheight="0" lines="0" right="719" top="374" width="211">
  <Textelement>näringsidkaren </Textelement>
  <Textelement tag="i">en </Textelement>
  <Textelement tag="i">varning. En var-</Textelement>
</Textbox>
Example #9
0
 def parametric_test(self, filename):
     """Run the test file ``filename`` and compare with the stored ``.xml``.

     Without a stored result the serialized parse is printed and the
     test fails. On a mismatch the file is parsed once more with
     ``p.debug`` switched on, the output is printed and the test fails.
     """
     resultfilename = filename.replace(".txt", ".xml")
     # ask for debug output up front if there is nothing to compare with
     debug = not os.path.exists(resultfilename)
     p, b = self.run_test_file(filename, debug)
     self.maxDiff = 4096

     if not os.path.exists(resultfilename):
         print("\nResult:\n" + elements.serialize(b))
         self.fail()
         return

     with codecs.open(resultfilename, encoding="utf-8") as fp:
         result = fp.read().strip()

     if result == elements.serialize(b).strip():
         self.assertEqual(result, elements.serialize(b).strip())
         return

     # mismatch: re-run the parse but with debugging on
     print("============DEBUG OUTPUT================")
     p.debug = True
     tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
     b = p.parse(tr.getiterator(tr.readparagraph))
     print("===============RESULT===================")
     print(elements.serialize(b))
     self.fail("========See output above=======")
Example #10
0
    def test_serialize_roundtrip(self):
        # Create a elements object tree
        tree = Body([
            Section([Paragraph(["Hello"]),
                     Paragraph(["World"])],
                    ordinal="1",
                    title="Main section"),
            Section([
                42,
                date(2013, 11, 27),
                datetime(2013, 11, 27, 12, 0, 0), b'bytestring', {
                    'foo': 'bar',
                    'x': 'y'
                }
            ],
                    ordinal=2,
                    title="Native types")
        ])
        # roundtrip using the default XML format
        serialized = serialize(tree)
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, caller_globals=globals())
        self.assertEqual(tree, newtree)

        # make another section with special (but commonly used) types
        # and try to roundtrip them. The XML serialization format does
        # not support this.
        graph = Graph().parse(
            data="""@prefix dcterms: <http://purl.org/dc/terms/> .

<http://example.org/1> dcterms:title "Hello world"@en .
""",
            format="turtle")
        parseresult = urlparser.parseString("http://example.org/1")
        tree.append(Section([parseresult, graph], meta=graph))

        # roundtrip using JSON (which uses fully qualified classnames,
        # so we don't need to pass globals() into deserialize()
        serialized = serialize(tree, format="json")
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, format="json")

        # two pyparsing.ParseResult objects cannot be directly
        # compared (they don't implement __eq__), therefore we compare
        # their XML representations
        tree[2][0] = util.parseresults_as_xml(tree[2][0])
        newtree[2][0] = util.parseresults_as_xml(newtree[2][0])
        self.assertEqual(tree, newtree)
Example #11
0
        

    def test_add_different_types(self):
        box1 = Textbox([Textelement("hey", tag=None)], fontid=None, top=0, left=0, width=50, height=10, lines=1)
        box2 = Textbox([LinkedTextelement("1", tag="s", uri="foo.html")], fontid=None, top=0, left=50, width=5, height=10, lines=1)
        combinedbox = box1 + box2
        want = """
<Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="55" top="0" width="55">
  <Textelement>hey</Textelement>
  <LinkedTextelement tag="s" uri="foo.html">1</LinkedTextelement>
</Textbox>
"""
        self.assertEqual(want[1:],
                         serialize(combinedbox))
        # make sure __iadd__ performs like __add__
        box1 += box2
Example #12
0
class Elements(unittest.TestCase):
    # None tells unittest not to truncate diffs in failure messages
    maxDiff = None

    def test_addboxes(self):
        # Two horizontally adjacent boxes with untagged text: the expected
        # sum is one box with a single, concatenated Textelement.
        box1 = Textbox([Textelement("hey ", tag=None)],
                       fontid=None, top=0, left=0, width=50, height=10,
                       lines=1)
        box2 = Textbox([Textelement("ho", tag=None)],
                       fontid=None, top=0, left=50, width=40, height=10,
                       lines=1)

        combinedbox = box1 + box2
        want = """
<Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="90" top="0" width="90">
  <Textelement>hey ho</Textelement>
</Textbox>
"""
        # want[1:] skips the newline right after the opening quotes
        self.assertEqual(want[1:],
                         serialize(combinedbox))
        # make sure __iadd__ performs like __add__; without the assertion
        # below the in-place result would never actually be checked
        box1 += box2
        self.assertEqual(want[1:],
                         serialize(box1))
Example #13
0
                         serialize(pdf[0]))

    def test_after_footnote_tag(self):
        # minimized version of Prop 2011/12:60 p 147. It seems to be
        # the empty italized textelement, combined with the
        # after_footnote context, that caused a crash
        pdf = self._parse_xml("""
	<fontspec id="0" size="12" family="Times New Roman" color="#000000"/>
	<fontspec id="4" size="12" family="Times New Roman,Italic" color="#000000"/>
	<fontspec id="9" size="7" family="Times New Roman" color="#000000"/>
<text top="63" left="283" width="37" height="13" font="0">20 a §</text>
<text top="60" left="320" width="5" height="9" font="9">4</text>
<text top="442" left="304" width="4" height="13" font="4"><i> </i></text>
<text top="460" left="306" width="41" height="13" font="4"><i>20 b § </i></text>
""")
        # make sure that empty element is removed completely
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="76" fontid="0" height="16" left="283" lineheight="0" lines="0" right="325" top="60" width="42">
    <Textelement>20 a §</Textelement>
    <Textelement tag="sup">4</Textelement>
  </Textbox>
  <Textbox bottom="473" fontid="4" height="31" left="304" lineheight="0" lines="0" right="347" top="442" width="43">
    <Textelement tag="i">20 b § </Textelement>
  </Textbox>
</Page>
Example #14
0

    def test_italic_superscript_unreliable_font(self):
        # the thing here is that font 2 and font 7 really has the same
        # font family.
        # ferenda.sources.legal.se.decoders.OffsetDecoder1d knows this
        # since it's hard-coded. The main problem is that the
        # OffsetDecoder1d.fontspecs methods (that aliases the fonts)
        # is run after PDFReader._parse_xml. Maybe we need to make
        # ._parse_xml call into the given textdecoder for each
        # fontspec tag it encounters?
        from ferenda.sources.legal.se.decoders import OffsetDecoder1d
        pdf = self._parse_xml("""
<fontspec id="2" size="14" family="MAMMBB+TT5Eo00" color="#000000"/>
<fontspec id="7" size="7" family="MBAAAC+TTA1o00" color="#000000"/>
<text top="552" left="340" width="372" height="17" font="2">intressant om 50 år föreslås att projektet Kulturarw</text>
<text top="549" left="712" width="5" height="13" font="7"><i>3</i></text>
<text top="552" left="717" width="98" height="17" font="2"> får fortsätta </text>
        """, OffsetDecoder1d)
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="569" fontid="2" height="20" left="340" lineheight="0" lines="0" right="815" top="549" width="475">
    <Textelement>intressant om 50 år föreslås att projektet Kulturarw</Textelement>
    <Textelement tag="is">3</Textelement>
    <Textelement> får fortsätta </Textelement>
  </Textbox>
</Page>
"""
Example #15
0
    def parse_document_from_soup(self, soup, doc):
        """Populate ``doc`` from ``soup``, then restructure ``doc.body``.

        The inherited implementation runs first. Its ``doc.body`` is then
        passed through the parser returned by ``self.get_parser()`` and
        decorated with ``self.decorate_bodyparts``. Debug output is
        enabled by ``self.config.fsmdebug`` or by the presence of the
        ``FERENDA_FSMDEBUG`` environment variable.

        Any exception raised by the parser is re-raised after being
        reported on stdout (and, in debug mode, with a traceback).
        """
        # first run inherited version to get a doc.body tree that's
        # close to the actual HTML
        super(W3Standards, self).parse_document_from_soup(soup, doc)

        # then clean up doc.body best as you can with a FSMParser
        parser = self.get_parser()
        # the environment variable can switch debugging on, never off
        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        try:
            doc.body = parser.parse(doc.body)
        except Exception:
            print("Exception")
            if parser.debug:
                import traceback
                # prints the exception currently being handled; avoids
                # unpacking sys.exc_info() into a local named `type`
                traceback.print_exc()
            raise

        # reset the class-level counter before decorating the body parts
        PreambleSection.counter = 0
        self.decorate_bodyparts(doc.body, doc.uri)

        if parser.debug:
            print(serialize(doc.body))
Example #16
0
    def test_after_footnote_tag(self):
        # minimized version of Prop 2011/12:60 p 147. It seems to be
        # the empty italized textelement, combined with the
        # after_footnote context, that caused a crash
        pdf = self._parse_xml("""
	<fontspec id="0" size="12" family="Times New Roman" color="#000000"/>
	<fontspec id="4" size="12" family="Times New Roman,Italic" color="#000000"/>
	<fontspec id="9" size="7" family="Times New Roman" color="#000000"/>
<text top="63" left="283" width="37" height="13" font="0">20 a §</text>
<text top="60" left="320" width="5" height="9" font="9">4</text>
<text top="442" left="304" width="4" height="13" font="4"><i> </i></text>
<text top="460" left="306" width="41" height="13" font="4"><i>20 b § </i></text>
""")
        # make sure that empty element is removed completely
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="76" fontid="0" height="16" left="283" lineheight="0" lines="0" right="325" top="60" width="42">
    <Textelement>20 a §</Textelement>
    <Textelement tag="sup">4</Textelement>
  </Textbox>
  <Textbox bottom="473" fontid="4" height="31" left="304" lineheight="0" lines="0" right="347" top="442" width="43">
    <Textelement tag="i">20 b § </Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #17
0
    def test_space_insertion(self):
        # this is really more of a test of as_xhtml, but the starting point is the XML parse. The goal is to recreate the trailing, italicized, space in the second <text> element

        pdf = self._parse_xml("""
	<fontspec id="10" size="7" family="Times New Roman" color="#000000"/>
<text top="699" left="327" width="226" height="20" font="10"><i>Myndig-</i></text>
<text top="720" left="327" width="230" height="20" font="10"><i>heten ska </i>lämna<i> </i></text>
<text top="740" left="327" width="230" height="20" font="10"><i>enligt</i>  23 a §.</text>
""")
        combined_tb = pdf[0][0] + pdf[0][1] + pdf[0][2]
        # make sure that empty element is removed completely
        want = """
<Textbox bottom="760" fontid="10" height="61" left="327" lineheight="0" lines="0" right="557" top="699" width="230">
  <Textelement tag="i">Myndigheten ska </Textelement>
  <Textelement>lämna </Textelement>
  <Textelement tag="i">enligt</Textelement>
  <Textelement> 23 a §.</Textelement>
</Textbox>
"""
        self.assertEqual(want[1:], serialize(combined_tb))
        res = etree.tostring(combined_tb.as_xhtml(None),
                             encoding="utf-8",
                             pretty_print=True).decode("utf-8")
        res = re.sub("p xmlns[^>]*", "p", res)
        want = """<p><i>Myndigheten ska </i>lämna <i>enligt</i> 23 a §.</p>"""
        self.assertEqual(want, res.strip())
Example #18
0
    def test_italic_superscript_unreliable_font(self):
        # the thing here is that font 2 and font 7 really has the same
        # font family.
        # ferenda.sources.legal.se.decoders.OffsetDecoder1d knows this
        # since it's hard-coded. The main problem is that the
        # OffsetDecoder1d.fontspecs methods (that aliases the fonts)
        # is run after PDFReader._parse_xml. Maybe we need to make
        # ._parse_xml call into the given textdecoder for each
        # fontspec tag it encounters?
        from ferenda.sources.legal.se.decoders import OffsetDecoder1d
        pdf = self._parse_xml(
            """
<fontspec id="2" size="14" family="MAMMBB+TT5Eo00" color="#000000"/>
<fontspec id="7" size="7" family="MBAAAC+TTA1o00" color="#000000"/>
<text top="552" left="340" width="372" height="17" font="2">intressant om 50 år föreslås att projektet Kulturarw</text>
<text top="549" left="712" width="5" height="13" font="7"><i>3</i></text>
<text top="552" left="717" width="98" height="17" font="2"> får fortsätta </text>
        """, OffsetDecoder1d)
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="569" fontid="2" height="20" left="340" lineheight="0" lines="0" right="815" top="549" width="475">
    <Textelement>intressant om 50 år föreslås att projektet Kulturarw</Textelement>
    <Textelement tag="is">3</Textelement>
    <Textelement> får fortsätta </Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #19
0
    def _test_parser(self, testfile, parser):
        """Run ``parser`` over the paragraphs in ``testfile`` and check the result.

        The file (read as iso-8859-1) holds the input, then the first
        blank line, then the expected output. Input paragraphs are
        separated by lines consisting of ``---``. A paragraph starting
        with ``RESET:`` clears ``parser.currentlynamedlaws`` first; one
        starting with ``NOBASE:`` is parsed with a base URI of ``None``.
        """
        with codecs.open(testfile, encoding='iso-8859-1') as fp:
            testdata = fp.read()

        # split at the first blank line only; with no blank line at all
        # the whole file is input and the expected output is empty
        halves = re.split('\r?\n\r?\n', testdata, 1)
        want = ''
        if len(halves) != 1:
            testdata, want = halves
        want = want.replace("\r\n", "\n").strip()

        got_paras = []
        for para in re.split('\r?\n---\r?\n', testdata):
            if para.startswith("RESET:"):
                parser.currentlynamedlaws.clear()
            baseuri = (None if para.startswith("NOBASE:")
                       else 'http://rinfo.lagrummet.se/publ/sfs/9999:999')
            nodes = parser.parse(para, baseuri)
            got_paras.append(serialize(nodes).strip())

        got = "\n---\n".join(got_paras).replace("\r\n", "\n").strip()
        self.maxDiff = None
        self.assertEqual(want, got)
Example #20
0
    def test_parse_recursive(self):
        # grammar for citations of the form "Doc <digits>/<4 digits>"
        ordinal = Word(nums).setResultsName("ordinal")
        year = Word(nums, exact=4).setResultsName("year")
        doc_citation = ("Doc" + ordinal + "/" + year).setResultsName("DocRef")

        def doc_uri_formatter(parts):
            return "http://example.org/docs/%(year)s/%(ordinal)s/" % parts

        def reference(text, uri):
            # the shape every expected link has in this test
            return LinkSubject(text,
                               predicate="dcterms:references",
                               uri=uri)

        doc = Body([
            Heading(["About Doc 43/2012 and it's interpretation"]),
            Paragraph([
                "According to Doc 43/2012",
                Footnote(["Available at http://example.org/xyz"]),
                " the bizbaz should be frobnicated",
            ]),
        ])

        docref_uri = "http://example.org/docs/2012/43/"
        expected_heading = Heading([
            "About ",
            reference("Doc 43/2012", docref_uri),
            " and it's interpretation",
        ])
        expected_paragraph = Paragraph([
            "According to ",
            reference("Doc 43/2012", docref_uri),
            Footnote([
                "Available at ",
                reference("http://example.org/xyz",
                          "http://example.org/xyz"),
            ]),
            " the bizbaz should be frobnicated",
        ])
        result = Body([expected_heading, expected_paragraph])

        cp = CitationParser(ferenda.citationpatterns.url, doc_citation)
        formatter = URIFormatter(("url", ferenda.uriformats.url),
                                 ("DocRef", doc_uri_formatter))
        cp.set_formatter(formatter)
        doc = cp.parse_recursive(doc)
        self.maxDiff = 4096
        self.assertEqual(serialize(doc), serialize(result))
Example #21
0
    def test_serialize_roundtrip(self):
        # Create a elements object tree
        tree = Body([Section([Paragraph(["Hello"]),
                              Paragraph(["World"])],
                             ordinal="1",
                             title="Main section"),
                     Section([42,
                              date(2013,11,27),
                              datetime(2013,11,27,12,0,0),
                              b'bytestring',
                              {'foo': 'bar',
                               'x': 'y'}],
                             ordinal=2,
                             title="Native types")
                 ])
        # roundtrip using the default XML format
        serialized = serialize(tree)
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, caller_globals=globals())
        self.assertEqual(tree, newtree)

        # make another section with special (but commonly used) types
        # and try to roundtrip them. The XML serialization format does
        # not support this.
        graph = Graph().parse(data="""@prefix dcterms: <http://purl.org/dc/terms/> .

<http://example.org/1> dcterms:title "Hello world"@en .
""", format="turtle")
        parseresult = urlparser.parseString("http://example.org/1")
        tree.append(Section([parseresult,
                             graph],
                            meta=graph))
        
        # roundtrip using JSON (which uses fully qualified classnames,
        # so we don't need to pass globals() into deserialize()
        serialized = serialize(tree, format="json")
        self.assertIsInstance(serialized, str)
        newtree = deserialize(serialized, format="json")

        # two pyparsing.ParseResult objects cannot be directly
        # compared (they don't implement __eq__), therefore we compare
        # their XML representations
        tree[2][0] = util.parseresults_as_xml(tree[2][0])
        newtree[2][0] = util.parseresults_as_xml(newtree[2][0])
        self.assertEqual(tree, newtree)
Example #22
0
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF."""
        source = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Preprocessing: remove the faux-bold formatting used in some
        # RFCs (repetitions of characters interleaved with backspace
        # control sequences). Note: the pattern contains '\b' as in
        # backspace, not r'\b' as in word boundary.
        rawparagraphs = source.getiterator(source.readparagraph)
        cleanparagraphs = (re.sub('.\b', '', para) for para in rawparagraphs)

        parser = self.get_parser(doc.basefile)
        # the environment variable can switch debugging on, never off
        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        # the first two parts of the parsed body are taken off as the
        # header and the title (whose whitespace runs are collapsed)
        header = doc.body.pop(0)
        title = " ".join(doc.body.pop(0).split())
        # drop the first preamble section titled "Table of Contents"
        for part in doc.body:
            if not isinstance(part, PreambleSection):
                continue
            if part.title == "Table of Contents":
                doc.body.remove(part)
                break

        # create (RDF) metadata for the document. Note: the provided
        # basefile may be incorrect -- whatever is in the header
        # overrides it, with the basefile as fallback (eg RFC 100)
        realid = self.get_rfc_num(header) or doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.rdftype(self.ns['rfc'].RFC)
        desc.value(self.ns['dct'].title, title, lang="en")
        self.parse_header(header, desc)
        has_identifier = desc.getvalues(self.ns['dct'].identifier)
        if not has_identifier:
            desc.value(self.ns['dct'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle:
            # only record a short title that differs from the full title
            if desc.getvalue(self.ns['dct'].title) != shorttitle:
                desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        PreambleSection.counter = 0
        if self.config.fsmdebug:
            print(serialize(doc.body))
Example #23
0
def testparser(testcase, parser, filename):
    """Helper function to test :py:class:`~ferenda.FSMParser` based parsers."""
    wantfilename = filename.replace(".txt", ".xml")
    # debug the parse when there is nothing to compare against, or when
    # the FERENDA_FSMDEBUG environment variable is present
    if 'FERENDA_FSMDEBUG' in os.environ or not os.path.exists(wantfilename):
        parser.debug = True

    reader = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    body = parser.parse(reader.getiterator(reader.readparagraph))

    if 'FERENDA_FSMDEBUG' in os.environ:
        print(elements.serialize(body))
    testcase.maxDiff = 4096

    if not os.path.exists(wantfilename):
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(body))

    with codecs.open(wantfilename, encoding="utf-8") as fp:
        want = fp.read().strip()
    got = elements.serialize(body).strip()
    testcase.assertEqualXML(want, got)
Example #24
0
def testparser(testcase, parser, filename):
    """Run an :py:class:`~ferenda.FSMParser` based parser on a test file.

    The text in ``filename`` is read paragraph by paragraph and parsed.
    The serialized result is compared with the content of the sibling
    file whose name is ``filename`` with ``.txt`` replaced by ``.xml``.

    :param testcase: Test case object; must provide ``assertEqualXML``
    :param parser: The parser to exercise
    :param filename: Path of the file holding the text to parse
    :raises AssertionError: If the ``.xml`` file does not exist
    """
    expected_path = filename.replace(".txt", ".xml")
    debug_env = 'FERENDA_FSMDEBUG' in os.environ
    # Turn on parser debugging when asked to through the environment,
    # or when there is nothing to compare the result with.
    if debug_env or not os.path.exists(expected_path):
        parser.debug = True

    reader = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    tree = parser.parse(reader.getiterator(reader.readparagraph))

    if debug_env:
        print(elements.serialize(tree))
    testcase.maxDiff = 4096
    if not os.path.exists(expected_path):
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(tree))
    with codecs.open(expected_path, encoding="utf-8") as fp:
        expected = fp.read().strip()
    actual = elements.serialize(tree).strip()
    testcase.assertEqualXML(expected, actual)
Example #25
0
 def test_json_roundtrip(self):
     # Round-trip a document parsed from a real PDF (a more realistic
     # example, with some hairy parts) through the JSON format.
     from ferenda import PDFDocumentRepository, PDFReader
     repo = PDFDocumentRepository()
     original = repo.make_document("sample")
     # make SURE that the intermediate files are newer than the pdf
     os.utime("test/files/pdfreader/intermediate/sample.xml", None)
     pdfreader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                           workdir="test/files/pdfreader/intermediate")
     repo.parse_from_pdfreader(pdfreader, original)
     restored = deserialize(serialize(original, format="json"),
                            format="json")
     self.assertEqual(original, restored)
Example #26
0

    def test_ending_whitespace_tag(self):
        pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">Something<i> </i></text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement>Something </Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:],
                         serialize(pdf[0]))

        pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">Something<i> </i></text>
<text top="706" left="148" width="4" height="18" font="3">Else</text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement>Something </Textelement>
  </Textbox>
  <Textbox bottom="724" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="706" width="4">
    <Textelement>Else</Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:],
                         serialize(pdf[0]))
        # concatenate the two boxes and make sure that an additional
        # space is produced
        glued = pdf[0][0] + pdf[0][1]
        res = etree.tostring(glued.as_xhtml(None)).decode()
        res = re.sub("p xmlns[^>]*", "p", res)
Example #27
0
 def test_json_roundtrip(self):
     # Round-trip a document parsed from a real PDF (a more realistic
     # example, with some hairy parts) through the JSON format.
     from ferenda import PDFDocumentRepository, PDFReader
     repo = PDFDocumentRepository()
     original = repo.make_document("sample")
     # make SURE that the intermediate files are newer than the pdf
     os.utime("test/files/pdfreader/intermediate/sample.xml", None)
     pdfreader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                           workdir="test/files/pdfreader/intermediate")
     repo.parse_from_pdfreader(pdfreader, original)
     restored = deserialize(serialize(original, format="json"),
                            format="json")
     self.assertEqual(original, restored)
Example #28
0
    def test_empty(self):
        pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">
  <b> </b>
</text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4" />
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #29
0
    def _test_parser(self, testfile, parser):
        """Run ``parser`` over the paragraphs in ``testfile`` and compare
        the serialized result with the expected result in the same file.

        The file is decoded as windows-1252 and split at the first blank
        line: what comes before it are the test paragraphs (separated by
        lines consisting of ``---``), what comes after it is the
        expected result. Without a blank line, the expected result is
        the empty string.

        A test paragraph may start with a directive:

        * ``RESET:`` clears ``parser.currentlynamedlaws`` and keeps the
          base URI attributes currently in effect
        * ``NOBASE:`` parses with empty base URI attributes
        * ``BASE:`` is followed, on the same line, by a Python dict
          literal with the base URI attributes; the paragraph proper
          starts on the next line

        Any other paragraph is parsed with the default base URI
        attributes ``{'law': '9999:999'}``.

        :param testfile: Path of the file with test data
        :param parser: Object providing ``parse`` and
                       ``currentlynamedlaws``
        """
        # encoding = 'iso-8859-1'
        encoding = 'windows-1252'
        with codecs.open(testfile, encoding=encoding) as fp:
            testdata = fp.read()

        parts = re.split('\r?\n\r?\n', testdata, 1)
        if len(parts) == 1:
            want = ''
        else:
            (testdata, want) = parts
        want = want.replace("\r\n", "\n").strip()
        # p.currentlynamedlaws = {} # needed?
        test_paras = re.split('\r?\n---\r?\n', testdata)
        got_paras = []

        # we need to set up logging in some way as legalref.parse will
        # use logging facilities. For tests though, we should only log
        # at CRITICAL level.
        import logging
        r = logging.getLogger()
        if not r.handlers:
            h = logging.StreamHandler()
            h.setFormatter(
                logging.Formatter("%(name)s %(levelname)s %(message)s"))
            r.addHandler(h)
            r.setLevel(logging.CRITICAL)

        # The "RESET:" branch below doesn't choose a base of its own, so
        # start out with the default one. Otherwise a file whose first
        # paragraph is a "RESET:" would fail on an unbound variable.
        baseuri_attributes = {'law': '9999:999'}
        for para in test_paras:
            if para.startswith("RESET:"):
                parser.currentlynamedlaws.clear()
            elif para.startswith("NOBASE:"):
                baseuri_attributes = {}
            elif para.startswith("BASE:"):
                # the dict literal is whatever follows "BASE:" on the
                # first line
                b = para.split("\n")[0].split(":", 1)[1]
                baseuri_attributes = ast.literal_eval(b)
                if 'type' in baseuri_attributes:
                    baseuri_attributes['type'] = URIRef(
                        baseuri_attributes['type'])
                # only the text after the directive line is parsed
                para = para.split("\n", 1)[1]
                if 'kommittensbetankande' in baseuri_attributes:
                    # this key configures the parser, it's not a part
                    # of the base URI attributes themselves
                    parser.kommittensbetankande = baseuri_attributes[
                        'kommittensbetankande']
                    del baseuri_attributes['kommittensbetankande']
            else:
                baseuri_attributes = {'law': '9999:999'}
            nodes = parser.parse(para, self.minter, self.metadata,
                                 baseuri_attributes)
            got_paras.append(serialize(nodes).strip())
        got = "\n---\n".join(got_paras).replace("\r\n", "\n").strip()
        self.maxDiff = None
        self.assertEqual(want, got)
Example #30
0

    def test_empty(self):
        pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">
  <b> </b>
</text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4" />
</Page>
"""
Example #31
0
 def parametric_test(self, filename):
     """Parse ``filename`` and compare the result with its ``.xml`` file.

     The expected result is read from the file whose name is
     ``filename`` with ``.txt`` replaced by ``.xml``. If that file is
     missing the result is printed and the test fails. If the result
     differs, the parse is run once more with debugging turned on
     before the test fails.

     :param filename: Path of the file holding the text to parse
     """
     expected_path = filename.replace(".txt",".xml")
     # no expected result to compare with: run with debugging on
     parser, tree = self.run_test_file(filename,
                                       not os.path.exists(expected_path))
     self.maxDiff = 4096
     if not os.path.exists(expected_path):
         print("\nResult:\n"+elements.serialize(tree))
         self.fail()
     else:
         with codecs.open(expected_path,encoding="utf-8") as fp:
             expected = fp.read().strip()
         # print(elements.serialize(tree))
         if expected == elements.serialize(tree).strip():
             self.assertEqual(expected, elements.serialize(tree).strip())
         else:
             # re-run the parse but with debugging on
             print("============DEBUG OUTPUT================")
             parser.debug = True
             reader = TextReader(filename,
                                 encoding="utf-8",
                                 linesep=TextReader.UNIX)
             tree = parser.parse(reader.getiterator(reader.readparagraph))
             print("===============RESULT===================")
             print(elements.serialize(tree))
             self.fail("========See output above=======")
Example #32
0
    def test_serialize_pyparsing(self):
        # these objects can't be roundtripped
        from ferenda.citationpatterns import url
        x = url.parseString("http://example.org/foo?param=val")
        serialized = serialize(Body([x]))
        self.assertEqual("""<Body>
  <url>
    <netloc>example.org</netloc>
    <path>/foo</path>
    <query>param=val</query>
    <scheme>http</scheme>
  </url>
</Body>
""", serialized)
Example #33
0
    def test_elements_from_soup(self):
        from ferenda.elements import html
        soup = BeautifulSoup(
            """<body>
<h1>Sample</h1>
<div class="main">
<img src="xyz.png"/>
<p>Some <b>text</b></p>
<dl>
<dt>Term 1</dt>
<dd>Definition 1</dd>
</dl>
</div>
<div id="foot">
<hr/>
<a href="/">home</a> - <a href="/about">about</a>
</div>
</body>""", "lxml")
        body = html.elements_from_soup(soup.body)
        # print("Body: \n%s" % serialize(body))
        result = html.Body([
            html.H1(["Sample"]),
            html.Div([
                html.Img(src="xyz.png"),
                html.P(["Some ", html.B(["text"])]),
                html.DL([html.DT(["Term 1"]),
                         html.DD(["Definition 1"])])
            ], **{"class": "main"}),
            html.Div([
                html.HR(),
                html.A(["home"], href="/"), " - ",
                html.A(["about"], href="/about")
            ],
                     id="foot")
        ])
        self.maxDiff = 4096
        self.assertEqual(serialize(body), serialize(result))
Example #34
0
    def test_ending_whitespace_tag(self):
        pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">Something<i> </i></text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement>Something </Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))

        pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">Something<i> </i></text>
<text top="706" left="148" width="4" height="18" font="3">Else</text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement>Something </Textelement>
  </Textbox>
  <Textbox bottom="724" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="706" width="4">
    <Textelement>Else</Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
        # concatenate the two boxes and make sure that an additional
        # space is produced
        glued = pdf[0][0] + pdf[0][1]
        res = etree.tostring(glued.as_xhtml(None)).decode()
        res = re.sub("p xmlns[^>]*", "p", res)
        want = "<p>Something Else</p>"
        self.assertEqual(want, res)
Example #35
0
    def parametric_test(self, filename):
        """Parse ``filename`` and compare the result with its ``.xml`` file.

        The file is read as iso-8859-1 with DOS line endings and parsed
        with the parser that ``self.p.get_parser`` returns. Ids are
        constructed, definitions looked up and references parsed before
        the tree is serialized and compared with the content of the
        file whose name is ``filename`` with ``.txt`` replaced by
        ``.xml``. If there is no such file, the serialized tree is
        expected to be empty.

        ``self.p.current_section`` and ``self.p.current_headline_level``
        are reset afterwards, whether the comparison succeeds or not.

        :param filename: Path of the file holding the text to parse
        """
        self.maxDiff = None
        reader = TextReader(filename=filename, encoding='iso-8859-1',
                            linesep=TextReader.DOS)
        reader.autostrip = True
        # p.lagrum_parser = FakeParser()
        try:
            parser = self.p.get_parser("9999:998", reader)
            b = parser(reader)
            counts = self.p._count_elements(b)

            # FIXME: How was this used? Where should we plug
            # skipfragments?
            if 'K' in counts and counts['K'] > 1 and counts['P1'] < 2:
                self.p.skipfragments = [
                    ('rinfoex:avdelningnummer', 'rpubl:kapitelnummer'),
                    ('rpubl:kapitelnummer', 'rpubl:paragrafnummer')]
            else:
                self.p.skipfragments = [('rinfoex:avdelningnummer',
                                         'rpubl:kapitelnummer')]

            # NB: _construct_ids won't look for references
            self.p.visit_node(b, self.p.construct_id,
                              {'basefile': '9999:998',
                               'uris': set()})
            self.p.visit_node(b, self.p.find_definitions, False,
                              debug=False)
            self.p.lagrum_parser.parse_recursive(b)
            self._remove_uri_for_testcases(b)
            resultfilename = filename.replace(".txt", ".xml")
            if os.path.exists(resultfilename):
                with codecs.open(resultfilename, encoding="utf-8") as fp:
                    result = fp.read().strip()
                self.assertEqual(result, serialize(b).strip())
            else:
                self.assertEqual("", serialize(b).strip())
        finally:
            # reset the state of the repo... This must happen even when
            # an assertion above fails, or the state left behind is
            # carried over to whatever uses self.p next.
            self.p.current_section = '0'
            self.p.current_headline_level = 0
Example #36
0
    def test_serialize_pyparsing(self):
        # these objects can't be roundtripped
        from ferenda.citationpatterns import url
        x = url.parseString("http://example.org/foo?param=val")
        serialized = serialize(Body([x]))
        self.assertEqual(
            """<Body>
  <url>
    <netloc>example.org</netloc>
    <path>/foo</path>
    <query>param=val</query>
    <scheme>http</scheme>
  </url>
</Body>
""", serialized)
Example #37
0
    def test_middle_whitespace_tag(self):
        pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3"><b>Verksamhetsregion<i> </i></b><b>Lund </b></text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement tag="b">Verksamhetsregion </Textelement>
    <Textelement tag="b">Lund </Textelement>
  </Textbox>
</Page>
"""
        # res = etree.tostring(pdf.as_xhtml(None)).decode()
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #38
0
    def test_parse_recursive(self):
        # Parse a small document tree for URLs and for a custom
        # "Doc <ordinal>/<year>" citation, and compare with the tree
        # that has the citations replaced by LinkSubject objects.
        ordinal = Word(nums).setResultsName("ordinal")
        year = Word(nums, exact=4).setResultsName("year")
        docref_pattern = ("Doc" + ordinal + "/" + year).setResultsName("DocRef")

        def format_doc_uri(parts):
            return "http://example.org/docs/%(year)s/%(ordinal)s/" % parts

        def reference(text, uri):
            # every expected link uses the same predicate
            return LinkSubject(text, predicate="dct:references", uri=uri)

        source = Body([
            Heading(["About Doc 43/2012 and it's interpretation"]),
            Paragraph(["According to Doc 43/2012",
                       Footnote(["Available at http://example.org/xyz"]),
                       " the bizbaz should be frobnicated"])
        ])

        expected = Body([
            Heading(["About ",
                     reference("Doc 43/2012",
                               "http://example.org/docs/2012/43/"),
                     " and it's interpretation"]),
            Paragraph(["According to ",
                       reference("Doc 43/2012",
                                 "http://example.org/docs/2012/43/"),
                       Footnote(["Available at ",
                                 reference("http://example.org/xyz",
                                           "http://example.org/xyz")]),
                       " the bizbaz should be frobnicated"])
        ])

        citparser = CitationParser(ferenda.citationpatterns.url,
                                   docref_pattern)
        formatter = URIFormatter(("url", ferenda.uriformats.url),
                                 ("DocRef", format_doc_uri))
        citparser.set_formatter(formatter)
        parsed = citparser.parse_recursive(source)
        self.maxDiff = 4096
        self.assertEqual(serialize(parsed), serialize(expected))
Example #39
0
        self.assertEqual(want,res)

    def test_middle_whitespace_tag(self):
        pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3"><b>Verksamhetsregion<i> </i></b><b>Lund </b></text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement tag="b">Verksamhetsregion </Textelement>
    <Textelement tag="b">Lund </Textelement>
  </Textbox>
</Page>
"""
        # res = etree.tostring(pdf.as_xhtml(None)).decode()
Example #40
0
    def test_footnote(self):
        pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="830" left="85" width="241" height="20" font="7">bindande verkan för det allmänna.</text>
<text top="829" left="327" width="5" height="12" font="15">7</text>
<text top="830" left="332" width="227" height="20" font="7">Bestämmelsen kan således inte </text>"""
                              )
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="850" fontid="7" height="21" left="85" lineheight="0" lines="0" right="559" top="829" width="474">
    <Textelement>bindande verkan för det allmänna.</Textelement>
    <Textelement tag="sup">7</Textelement>
    <Textelement>Bestämmelsen kan således inte </Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #41
0
        
    def test_footnote(self):
        pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="830" left="85" width="241" height="20" font="7">bindande verkan för det allmänna.</text>
<text top="829" left="327" width="5" height="12" font="15">7</text>
<text top="830" left="332" width="227" height="20" font="7">Bestämmelsen kan således inte </text>""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="850" fontid="7" height="21" left="85" lineheight="0" lines="0" right="559" top="829" width="474">
    <Textelement>bindande verkan för det allmänna.</Textelement>
    <Textelement tag="sup">7</Textelement>
    <Textelement>Bestämmelsen kan således inte </Textelement>
  </Textbox>
</Page>
"""
Example #42
0
 def test_serialize_roundtrip(self):
     # Build an elements object tree that mixes ferenda elements with
     # native types, and check that it survives being serialized and
     # deserialized again.
     text_section = Section([Paragraph(["Hello"]),
                             Paragraph(["World"])],
                            ordinal="1",
                            title="Main section")
     native_section = Section([42,
                               date(2013,11,27),
                               b'bytestring',
                               {'foo': 'bar',
                                'x': 'y'}],
                              ordinal=2,
                              title="Native types")
     original = Body([text_section, native_section])
     as_text = serialize(original)
     self.assertIsInstance(as_text, str)
     # globals() is what deserialize gets to look up class names in
     restored = deserialize(as_text, globals())
     self.assertEqual(original, restored)
Example #43
0
    def test_linked_footnote(self):
        pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="830" left="85" width="241" height="20" font="7">bindande verkan för det allmänna.</text>
<text top="829" left="327" width="5" height="12" font="15"><a href="unik-kunskap-genom-registerforskning-sou-201445.html#120">7</a></text>
<text top="830" left="332" width="227" height="20" font="7"><a href="unik-kunskap-genom-registerforskning-sou-201445.html#120"> </a>Bestämmelsen kan således inte </text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="850" fontid="7" height="21" left="85" lines="-2" right="559" top="829" width="474">
    <Textelement>bindande verkan för det allmänna.</Textelement>
    <LinkedTextelement tag="s" uri="unik-kunskap-genom-registerforskning-sou-201445.html#120">7</LinkedTextelement>
    <LinkedTextelement uri="unik-kunskap-genom-registerforskning-sou-201445.html#120"> </LinkedTextelement>
    <Textelement>Bestämmelsen kan således inte </Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #44
0
    def test_comment(self):
        pdf = self._parse_xml("""
<fontspec id="1" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="270" left="278" width="450" height="12" font="1">First line</text>
<!-- comments like this won't appear in real pdf2xml output, but might appear
     in test cases -->
<text top="290" left="278" width="450" height="12" font="1">Second line</text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="282" fontid="1" height="12" left="278" lineheight="0" lines="0" right="728" top="270" width="450">
    <Textelement>First line</Textelement>
  </Textbox>
  <Textbox bottom="302" fontid="1" height="12" left="278" lineheight="0" lines="0" right="728" top="290" width="450">
    <Textelement>Second line</Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #45
0

    def test_comment(self):
        pdf = self._parse_xml("""
<fontspec id="1" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="270" left="278" width="450" height="12" font="1">First line</text>
<!-- comments like this won't appear in real pdf2xml output, but might appear
     in test cases -->
<text top="290" left="278" width="450" height="12" font="1">Second line</text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="282" fontid="1" height="12" left="278" lineheight="0" lines="0" right="728" top="270" width="450">
    <Textelement>First line</Textelement>
  </Textbox>
  <Textbox bottom="302" fontid="1" height="12" left="278" lineheight="0" lines="0" right="728" top="290" width="450">
    <Textelement>Second line</Textelement>
  </Textbox>
</Page>
"""
Example #46
0
    def test_footnote_footer(self):
        pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="16" size="10" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="17" size="5" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="849" left="85" width="472" height="20" font="7">ligga till grund för några individuella rättigheter. I 2 kap. 4 och 5 §§ </text>
<text top="891" left="85" width="4" height="9" font="17">7</text>
<text top="891" left="89" width="258" height="15" font="16"> Prop. 1975/76:209 s. 128, prop. 2009/10:80 s. 173. </text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="869" fontid="7" height="20" left="85" lineheight="0" lines="0" right="557" top="849" width="472">
    <Textelement>ligga till grund för några individuella rättigheter. I 2 kap. 4 och 5 §§ </Textelement>
  </Textbox>
  <Textbox bottom="906" fontid="16" height="15" left="85" lineheight="0" lines="0" right="347" top="891" width="262">
    <Textelement tag="sup">7</Textelement>
    <Textelement> Prop. 1975/76:209 s. 128, prop. 2009/10:80 s. 173. </Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #47
0
        

    def test_footnote_footer(self):
        pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="16" size="10" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="17" size="5" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="849" left="85" width="472" height="20" font="7">ligga till grund för några individuella rättigheter. I 2 kap. 4 och 5 §§ </text>
<text top="891" left="85" width="4" height="9" font="17">7</text>
<text top="891" left="89" width="258" height="15" font="16"> Prop. 1975/76:209 s. 128, prop. 2009/10:80 s. 173. </text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="869" fontid="7" height="20" left="85" lineheight="0" lines="0" right="557" top="849" width="472">
    <Textelement>ligga till grund för några individuella rättigheter. I 2 kap. 4 och 5 §§ </Textelement>
  </Textbox>
  <Textbox bottom="906" fontid="16" height="15" left="85" lineheight="0" lines="0" right="347" top="891" width="262">
    <Textelement tag="sup">7</Textelement>
    <Textelement> Prop. 1975/76:209 s. 128, prop. 2009/10:80 s. 173. </Textelement>
  </Textbox>
</Page>
"""
Example #48
0
    def test_footnote_lineending(self):
        pdf = self._parse_xml("""
<fontspec id="0" size="13" family="GGKKGC+TimesNewRomanPSMT" color="#000000"/>
<fontspec id="4" size="13" family="GGKKID+TimesNewRomanPS-ItalicMT" color="#000000"/>
<fontspec id="7" size="7" family="GGKKGC+TimesNewRomanPSMT" color="#000000"/>
<text top="161" left="291" width="401" height="17" font="0">Härigenom föreskrivs i fråga om mervärdesskattelagen (1994:200)</text>
<text top="159" left="692" width="5" height="11" font="7">7</text>
<text top="161" left="697" width="4" height="17" font="0"> </text>
<text top="178" left="291" width="249" height="17" font="4"><i>dels</i> att 1 kap. 12 § ska upphöra att gälla, </text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="178" fontid="0" height="19" left="291" lineheight="0" lines="0" right="697" top="159" width="406">
    <Textelement>Härigenom föreskrivs i fråga om mervärdesskattelagen (1994:200)</Textelement>
    <Textelement tag="sup">7</Textelement>
  </Textbox>
  <Textbox bottom="195" fontid="4" height="17" left="291" lineheight="0" lines="0" right="540" top="178" width="249">
    <Textelement tag="i">dels</Textelement>
    <Textelement> att 1 kap. 12 § ska upphöra att gälla, </Textelement>
  </Textbox>
</Page>
"""
        self.assertEqual(want[1:], serialize(pdf[0]))
Example #49
0
        self.assertEqual(want[1:], serialize(pdf[0]))

    def test_space_insertion(self):
        # this is really more of a test of as_xhtml, but the starting point is the XML parse. The goal is to recreate the trailing, italicized, space in the second <text> element
        
        pdf = self._parse_xml("""
	<fontspec id="10" size="7" family="Times New Roman" color="#000000"/>
<text top="699" left="327" width="226" height="20" font="10"><i>Myndig-</i></text>
<text top="720" left="327" width="230" height="20" font="10"><i>heten ska </i>lämna<i> </i></text>
<text top="740" left="327" width="230" height="20" font="10"><i>enligt</i>  23 a §.</text>
""")
        combined_tb = pdf[0][0] + pdf[0][1] + pdf[0][2]
        # make sure that empty element is removed completely
        want = """
<Textbox bottom="760" fontid="10" height="61" left="327" lineheight="0" lines="0" right="557" top="699" width="230">
  <Textelement tag="i">Myndigheten ska </Textelement>
  <Textelement>lämna </Textelement>
  <Textelement tag="i">enligt</Textelement>
  <Textelement> 23 a §.</Textelement>
</Textbox>
"""
        self.assertEqual(want[1:], serialize(combined_tb))
        res = etree.tostring(combined_tb.as_xhtml(None), encoding="utf-8", pretty_print=True).decode("utf-8")
        res = re.sub("p xmlns[^>]*", "p", res)
Example #50
0
    def fsmparse(self, functionname, source):
        """Parse a list of text chunks using a named fsm parser and
        output the parse tree and final result to stdout.

        :param functionname: Dotted name, on the form
                             ``module.Class.method``, of a function
                             that returns a configured
                             :py:class:`~ferenda.FSMParser`
        :type  functionname: str
        :param source:       A file containing the text chunks, separated
                             by double newlines
        :type source:        str
        :raises ValueError: If ``functionname`` does not have at least
                            three dot-separated parts
        :raises AttributeError: If the module has no class with the
                                given name, or the class has no
                                attribute with the method name

        """
        try:
            modulename, classname, methodname = functionname.rsplit(".", 2)
        except ValueError:
            raise ValueError("functionname must be on the form "
                             "'module.Class.method', got %r" % functionname)
        __import__(modulename)
        m = sys.modules[modulename]
        # Look the class up by name. Searching through all classes of
        # the module and falling through when none matches would leave
        # us with some unrelated class (or none at all).
        cls = getattr(m, classname, None)
        if not inspect.isclass(cls):
            raise AttributeError("Module %r has no class named %r" %
                                 (modulename, classname))
        method = getattr(cls, methodname)
        parser = method()
        parser.debug = True
        tr = TextReader(source)
        b = parser.parse(tr.getiterator(tr.readparagraph))
        print(serialize(b))
Example #51
0

    def test_footnote_lineending(self):
        pdf = self._parse_xml("""
<fontspec id="0" size="13" family="GGKKGC+TimesNewRomanPSMT" color="#000000"/>
<fontspec id="4" size="13" family="GGKKID+TimesNewRomanPS-ItalicMT" color="#000000"/>
<fontspec id="7" size="7" family="GGKKGC+TimesNewRomanPSMT" color="#000000"/>
<text top="161" left="291" width="401" height="17" font="0">Härigenom föreskrivs i fråga om mervärdesskattelagen (1994:200)</text>
<text top="159" left="692" width="5" height="11" font="7">7</text>
<text top="161" left="697" width="4" height="17" font="0"> </text>
<text top="178" left="291" width="249" height="17" font="4"><i>dels</i> att 1 kap. 12 § ska upphöra att gälla, </text>
""")
        want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="178" fontid="0" height="19" left="291" lineheight="0" lines="0" right="697" top="159" width="406">
    <Textelement>Härigenom föreskrivs i fråga om mervärdesskattelagen (1994:200)</Textelement>
    <Textelement tag="sup">7</Textelement>
  </Textbox>
  <Textbox bottom="195" fontid="4" height="17" left="291" lineheight="0" lines="0" right="540" top="178" width="249">
    <Textelement tag="i">dels</Textelement>
    <Textelement> att 1 kap. 12 § ska upphöra att gälla, </Textelement>
  </Textbox>
</Page>
"""
Example #52
0
File: rfc.py Project: zigit/ferenda
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF.

        Reads the downloaded text for ``doc.basefile``, runs it through
        the parser that ``self.get_parser`` returns and stores the
        result in ``doc.body``. Metadata (generating class, title,
        identifier, RDF type and, where found, short title) is recorded
        in ``doc.meta`` through a ``Describer``, and ``doc.uri`` and
        ``doc.lang`` are set.

        :param doc: The document object to fill in; ``doc.basefile``
                    identifies the downloaded file to read
        :returns: Always ``True``
        """

        reader = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        # docstring = re.sub('.\b','',docstring)
        cleanparagraphs = (re.sub('.\b', '', x)
                           for x in reader.getiterator(reader.readparagraph))

        parser = self.get_parser(doc.basefile)

        # The environment variable turns debugging on, but never off if
        # the config already asks for it
        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        # The two first parts of the body are taken out: the first is
        # used as the header, the second (with whitespace normalized)
        # as the title
        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title = " ".join(
            doc.body.pop(0).split())  # body.findByClass(DocHeader)
        # Drop the first "Table of Contents" preamble section, if any.
        # Removing while iterating is OK here since we break right away
        for part in doc.body:
            if isinstance(
                    part,
                    PreambleSection) and part.title == "Table of Contents":
                doc.body.remove(part)
                break

        # create (RDF) metadata for document Note: The provided
        # basefile may be incorrect -- let whatever is in the header
        # override
        realid = self.get_rfc_num(header)
        if not realid:  # eg RFC 100 -- fallback to basefile in that case
            realid = doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        desc.value(self.ns['dcterms'].title, title, lang="en")
        self.parse_header(header, desc)
        # parse_header might have set .rdftype, but if not:
        try:
            desc.getrdftype()
        except KeyError:
            desc.rdftype(self.ns['rfc'].RFC)

        # Only make up an identifier if parse_header didn't provide one
        if not desc.getvalues(self.ns['dcterms'].identifier):
            desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        # A short title that just repeats the title adds nothing
        if shorttitle and (desc.getvalue(self.ns['dcterms'].title) !=
                           shorttitle):
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        # The counter is a class attribute, reset it after each document
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True
Example #53
0
 def test_serialize_newstr(self):
     # Guards against regressions with future.types.newstr.newstr
     # (aliased to str() here); only py2 is ever affected.
     expected_xml = '<Body a="x" b="y" />\n'
     node = Body([], a=str("x"), b="y")
     self.assertEqual(expected_xml, serialize(node, format="xml"))
Example #54
0
but by the catch-all is_paragraph. The recognizers are run in the
order specified by FSMParser.set_transitions().

    This is a preformatted section.
        It could be used for source code,
    +-------------------+
    |   line drawings   |
    +-------------------+
        or what have                 you.

Second section
==============

The above new section implicitly closed the first section which we
were in. This was made explicit by the last transition rule, which
stated that any time a section is encountered while in the "section"
state, we should not create any more children (False) but instead
return to our previous state (which in this case is "body", but for a
more complex language could be any number of states)."""

# Configure the parser: register the recognizer functions and the
# transition table, then tell it how to start (which constructor builds
# the root node and which state it begins in).
p = FSMParser()
p.set_recognizers(is_section, is_preformatted, is_paragraph)
p.set_transitions(transitions)
p.initial_constructor = make_body
p.initial_state = "body"
# Feed the parser one chunk per blank-line-separated block of the text.
body = p.parse(text.split("\n\n"))
# print(elements.serialize(body))

# end main
# Expose the serialized parse result under a module-level name.
return_value = elements.serialize(body)
Example #55
0
but by the catch-all is_paragraph. The recognizers are run in the
order specified by FSMParser.set_transitions().

    This is a preformatted section.
        It could be used for source code,
    +-------------------+
    |   line drawings   |
    +-------------------+
        or what have                 you.

Second section
==============

The above new section implicitly closed the first section which we
were in. This was made explicit by the last transition rule, which
stated that any time a section is encountered while in the "section"
state, we should not create any more children (False) but instead
return to our previous state (which in this case is "body", but for a
more complex language could be any number of states)."""

# Configure the parser: register the recognizer functions and the
# transition table, then tell it how to start (which constructor builds
# the root node and which state it begins in).
p = FSMParser()
p.set_recognizers(is_section, is_preformatted, is_paragraph)
p.set_transitions(transitions)
p.initial_constructor = make_body
p.initial_state = "body"
# Feed the parser one chunk per blank-line-separated block of the text.
body = p.parse(text.split("\n\n"))
# print(elements.serialize(body))

# end main
# Expose the serialized parse result under a module-level name.
return_value = elements.serialize(body)
Example #56
0
 def test_serialize_newstr(self):
     # Guards against regressions with future.types.newstr.newstr
     # (aliased to str() here); only py2 is ever affected.
     expected_xml = '<Body a="x" b="y" />\n'
     node = Body([], a=str("x"), b="y")
     self.assertEqual(expected_xml, serialize(node, format="xml"))
Example #57
0
    def wrapper(self, doc):
        # Runs the wrapped function ``f`` on ``doc`` and then writes the
        # resulting document out: optionally as JSON, always as XHTML
        # (via render_xhtml) and as RDF/XML distilled from that XHTML.
        # Afterwards it logs warnings about missing required triples,
        # metadata lost in serialization (when config.validaterdfa is
        # set) and a missing id/title on the document entry.
        #
        # Returns whatever ``f(self, doc)`` returned.

        # call the actual function that creates the doc data
        oldbasefile = doc.basefile
        ret = f(self, doc)
        if doc.basefile != oldbasefile:
            # means that basefile was adjusted.  Touch the old parsed
            # path first so we don't regenerate.
            with self.store.open_parsed(oldbasefile, "w"):
                pass
            # move any intermediate files (in particular extracted
            # image backgrounds from PDF files) that might be
            # needed later.
            old_intermediate = self.store.intermediate_path(oldbasefile)
            new_intermediate = self.store.intermediate_path(doc.basefile)
            # With the "dir" storage policy the parent directory of the
            # intermediate path is what gets moved, not the single file.
            if self.store.storage_policy == "dir":
                old_intermediate = os.path.dirname(old_intermediate)
                new_intermediate = os.path.dirname(new_intermediate)
            # Only move when there is something to move and the
            # destination does not already exist.
            if os.path.exists(old_intermediate) and not os.path.exists(new_intermediate):
                util.ensure_dir(new_intermediate)
                os.rename(old_intermediate, new_intermediate)
        # now render that doc data as files (JSON, XHTML, RDF/XML)
        if self.config.serializejson == True:
            with self.store.open_serialized(doc.basefile, "wb") as fp:
                r = serialize(doc, format="json")  # should be a (unicode) str
                fp.write(r.encode('utf-8'))
            self.log.debug(
                "Created %s" %
                (self.store.serialized_path(
                    doc.basefile)))
        # css file + background images + png renderings of text
        resources = self.create_external_resources(doc)
        # Keep only the stylesheet resources, converted by cssuri()
        # relative to the document URI.
        if resources:
            cssuris = [cssuri(doc.uri, x) for x in resources if x.endswith(".css")]
        else:
            cssuris = []
        # doc.cssuris is only assigned when at least one stylesheet exists.
        if cssuris:
            doc.cssuris = cssuris
        updated = self.render_xhtml(doc, self.store.parsed_path(doc.basefile, version=doc.version))
        if updated:
            # NOTE(review): the path logged here is computed without
            # version=doc.version, unlike the path handed to
            # render_xhtml above -- confirm whether that is intended.
            self.log.debug(
                "Created %s" %
                (self.store.parsed_path(
                    doc.basefile)))


        # Extract all triples on the XHTML/RDFa data to a separate
        # RDF/XML file
        distilled_graph = Graph()
        with codecs.open(self.store.parsed_path(doc.basefile, version=doc.version),
                         encoding="utf-8") as fp:  # unicode
            distilled_graph.parse(data=fp.read(), format="rdfa",
                                  publicID=doc.uri)

        # The act of parsing from RDFa binds a lot of namespaces
        # in the graph in an unnecessary manner. Particularly it
        # binds both 'dc' and 'dcterms' to
        # 'http://purl.org/dc/terms/', which makes serialization
        # less than predictable. Blow these prefixes away.
        distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
        distilled_graph.bind(
            "dcterms",
            URIRef("http://example.org/this-prefix-should-not-be-used"))

        util.ensure_dir(self.store.distilled_path(doc.basefile, version=doc.version))
        with open(self.store.distilled_path(doc.basefile, version=doc.version),
                  "wb") as distilled_file:
            # print("============distilled===============")
            # print(distilled_graph.serialize(format="turtle").decode('utf-8'))
            distilled_graph.serialize(distilled_file, format="pretty-xml")
        self.log.debug(
            '%s triples extracted to %s',
            len(distilled_graph), self.store.distilled_path(doc.basefile, version=doc.version))

        # Validate that all required triples are present (we check
        # distilled_graph, but we could just as well check doc.meta)
        required = sorted(set(self.get_required_predicates(doc)))
        for p in required:
            x = distilled_graph.value(URIRef(doc.uri), p)
            if not x:
                self.log.warning("Metadata is missing a %s triple" %
                                 (distilled_graph.qname(p)))
        if 'validaterdfa' in self.config and self.config.validaterdfa:
            # Validate that all triples specified in doc.meta and any
            # .meta property on any body object is present in the
            # XHTML+RDFa file.  NOTE: graph_diff has suddenly become
            # glacial on medium-large graphs (> ~1000 triples). Maybe we
            # don't have to validate them?
            huge_graph = False
            # Merge each body graph into doc.meta (mutating doc.meta in
            # place), bailing out as soon as it exceeds 1000 triples.
            for g in iterate_graphs(doc.body):
                doc.meta += g
                if len(doc.meta) > 1000:
                    huge_graph = True
                    break
            if huge_graph:
                self.log.warning("Graph seems huge, skipping validation")
            else:
                # self.log.debug("diffing graphs")
                (in_both, in_first, in_second) = graph_diff(doc.meta, distilled_graph)
                self.log.debug("graphs diffed (-%s, +%s)" % (len(in_first), len(in_second)))

                if in_first:  # original metadata not present in the XHTML file
                    # NOTE(review): .decode() assumes serialize() returns
                    # bytes here -- confirm against the rdflib version
                    # in use.
                    self.log.warning("%d triple(s) from the original metadata was "
                                     "not found in the serialized XHTML file:\n%s",
                                     len(in_first), in_first.serialize(format="n3").decode("utf-8"))

        # Validate that entry.title and entry.id has been filled
        # (might be from doc.meta and doc.uri, might be other things)
        entry = DocumentEntry(self.store.documententry_path(doc.basefile, version=doc.version))
        if not entry.id:
            self.log.warning("entry.id missing")
        if not entry.title:
            self.log.warning("entry.title missing")
        return ret
Example #58
0
 def do_test(self, keywords, want):
     # Build a TOC page body from the given keywords and compare the
     # serialized form of its second element against the expectation.
     keyword_repo = LNKeyword()
     items = map(self.makeitem, keywords)
     page_body = keyword_repo.toc_generate_page_body(items, None)
     self.assertEqual(want, serialize(page_body[1]))