def test_parse_existing(self):
    """Verify that parse_recursive() leaves an already-built element tree
    untouched, both with and without patterns/formatters configured."""
    # make sure parserecursive doesn't mess with existing structure.
    class MyHeader(UnicodeElement):
        pass
    doc = Body([MyHeader("My document"),
                Paragraph([
                    "It's a very very fine document.",
                    MyHeader("Subheading"),
                    "And now we're done."
                ])
                ])
    want = serialize(doc)
    # first test a blank CitationParser, w/o patterns or formatter
    cp = CitationParser()
    doccopy = deepcopy(doc)
    cp.parse_recursive(doccopy)
    got = serialize(doccopy)
    self.assertEqual(want, got)
    # then with a real pattern and formatter; the text contains no
    # citations, so the structure must still come out unchanged
    cp = CitationParser(ferenda.citationpatterns.url)
    cp.set_formatter(URIFormatter(("url", ferenda.uriformats.url)))
    doccopy = deepcopy(doc)
    cp.parse_recursive(doccopy)
    got = serialize(doccopy)
    self.assertEqual(want, got)
def parametric_test(self, filename):
    """Parse one SFS test fixture and compare the serialized element tree
    with the expected .xml file next to it (empty expectation if missing)."""
    p = SFS()
    p.id = '(test)'
    p.reader = TextReader(filename=filename, encoding='iso-8859-1',
                          linesep=TextReader.DOS)
    p.reader.autostrip = True
    # p.lagrum_parser = FakeParser()
    b = p.makeForfattning()
    elements = p._count_elements(b)
    # choose which fragment types to skip when constructing ids,
    # depending on the document's chapter/paragraph structure
    if 'K' in elements and elements['K'] > 1 and elements['P1'] < 2:
        # should be "skipfragments = ['A','K']", but this breaks test cases
        skipfragments = ['A', 'K']
    else:
        skipfragments = ['A']
    p._construct_ids(b, '',
                     'http://rinfo.lagrummet.se/publ/sfs/9999:999',
                     skipfragments)
    self._remove_uri_for_testcases(b)
    resultfilename = filename.replace(".txt", ".xml")
    self.maxDiff = 4096
    if os.path.exists(resultfilename):
        with codecs.open(resultfilename, encoding="utf-8") as fp:
            result = fp.read().strip()
        self.assertEqual(result, serialize(b).strip())
    else:
        # no expected-result file: fail, showing the full serialization
        self.assertEqual("", serialize(b).strip())
def test_parse_existing(self):
    """Verify that parse_recursive() leaves an already-built element tree
    untouched, both with and without patterns/formatters configured."""
    # make sure parserecursive doesn't mess with existing structure.
    class MyHeader(UnicodeElement):
        pass
    doc = Body([
        MyHeader("My document"),
        Paragraph([
            "It's a very very fine document.",
            MyHeader("Subheading"),
            "And now we're done."
        ])
    ])
    want = serialize(doc)
    # first test a blank CitationParser, w/o patterns or formatter
    cp = CitationParser()
    doccopy = deepcopy(doc)
    cp.parse_recursive(doccopy)
    got = serialize(doccopy)
    self.assertEqual(want, got)
    # then with a real pattern and formatter; the text contains no
    # citations, so the structure must still come out unchanged
    cp = CitationParser(ferenda.citationpatterns.url)
    cp.set_formatter(URIFormatter(("url", ferenda.uriformats.url)))
    doccopy = deepcopy(doc)
    cp.parse_recursive(doccopy)
    got = serialize(doccopy)
    self.assertEqual(want, got)
def test_add_different_types(self):
    """Adding two Textboxes whose elements are of different types keeps the
    elements as separate children of the combined box; in-place addition
    must produce the same result as the binary operator."""
    plain_box = Textbox([Textelement("hey", tag=None)],
                        fontid=None, top=0, left=0,
                        width=50, height=10, lines=1)
    linked_box = Textbox([LinkedTextelement("1", tag="s", uri="foo.html")],
                         fontid=None, top=0, left=50,
                         width=5, height=10, lines=1)
    want = """
<Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="55" top="0" width="55">
  <Textelement>hey</Textelement>
  <LinkedTextelement tag="s" uri="foo.html">1</LinkedTextelement>
</Textbox>
"""
    self.assertEqual(want[1:], serialize(plain_box + linked_box))
    # __iadd__ must behave exactly like __add__
    plain_box += linked_box
    self.assertEqual(want[1:], serialize(plain_box))
def test_multiple_textelements(self): pdf = self._parse_xml(""" <fontspec id="1" size="5" family="X" color="#00000"/> <text top="0" left="0" width="23" height="13" font="1"><b>foo</b> <b>bar</b></text> """) # test that the space between the two <b> tags doesn't get lost self.assertEqual("foo bar", str(pdf[0][0])) self.assertEqual('<Textelement tag="b">foo bar</Textelement>', serialize(pdf[0][0][0] + pdf[0][0][1]).strip()) want = """ <Textbox bottom="13" fontid="1" height="13" left="0" lineheight="0" lines="0" right="23" top="0" width="23"> <Textelement tag="b">foo </Textelement> <Textelement tag="b">bar</Textelement> </Textbox> """ self.assertEqual(want[1:], serialize(pdf[0][0])) # 2nd test, with leading non-tagged Textelement pdf = self._parse_xml(""" <fontspec id="0" size="5" family="X" color="#00000"/> <text top="374" left="508" width="211" height="14" font="0">näringsidkaren <i>en</i> <i>varning. En var-</i></text> """) want = """ <Textbox bottom="388" fontid="0" height="14" left="508" lineheight="0" lines="0" right="719" top="374" width="211"> <Textelement>näringsidkaren </Textelement> <Textelement tag="i">en </Textelement> <Textelement tag="i">varning. En var-</Textelement> </Textbox> """ self.assertEqual(want[1:], serialize(pdf[0][0]))
def test_elements_from_soup(self):
    """elements_from_soup() converts a BeautifulSoup tree into the
    corresponding ferenda.elements.html object tree."""
    from ferenda.elements import html
    soup = BeautifulSoup("""<body>
<h1>Sample</h1>
<div class="main">
<img src="xyz.png"/>
<p>Some <b>text</b></p>
<dl>
<dt>Term 1</dt>
<dd>Definition 1</dd>
</dl>
</div>
<div id="foot">
<hr/>
<a href="/">home</a> - <a href="/about">about</a>
</div>
</body>""", "lxml")
    body = html.elements_from_soup(soup.body)
    # print("Body: \n%s" % serialize(body))
    result = html.Body([html.H1(["Sample"]),
                        html.Div([html.Img(src="xyz.png"),
                                  html.P(["Some ",
                                          html.B(["text"])]),
                                  html.DL([html.DT(["Term 1"]),
                                           html.DD(["Definition 1"])])
                                  ], **{"class": "main"}),
                        html.Div([html.HR(),
                                  html.A(["home"], href="/"),
                                  " - ",
                                  html.A(["about"], href="/about")
                                  ], id="foot")])
    self.maxDiff = 4096
    self.assertEqual(serialize(body), serialize(result))
def test_addboxes(self):
    """Adding two Textboxes whose Textelements share the same (no) tag
    merges the text into a single element; in-place addition must match
    the binary operator."""
    left_box = Textbox([Textelement("hey ", tag=None)],
                       fontid=None, top=0, left=0,
                       width=50, height=10, lines=1)
    right_box = Textbox([Textelement("ho", tag=None)],
                        fontid=None, top=0, left=50,
                        width=40, height=10, lines=1)
    want = """
<Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="90" top="0" width="90">
  <Textelement>hey ho</Textelement>
</Textbox>
"""
    self.assertEqual(want[1:], serialize(left_box + right_box))
    # __iadd__ must behave exactly like __add__
    left_box += right_box
    self.assertEqual(want[1:], serialize(left_box))
def test_multiple_textelements(self):
    """Whitespace between adjacent tagged text nodes must be preserved,
    both when stringifying and when concatenating Textelements.

    NOTE(review): this copy was truncated (the second ``want`` literal
    was unterminated and the final assertion missing); the tail has been
    restored from the identical sibling copy of this test in this file.
    """
    pdf = self._parse_xml("""
<fontspec id="1" size="5" family="X" color="#00000"/>
<text top="0" left="0" width="23" height="13" font="1"><b>foo</b> <b>bar</b></text>
""")
    # test that the space between the two <b> tags doesn't get lost
    self.assertEqual("foo bar", str(pdf[0][0]))
    self.assertEqual('<Textelement tag="b">foo bar</Textelement>',
                     serialize(pdf[0][0][0] + pdf[0][0][1]).strip())
    want = """
<Textbox bottom="13" fontid="1" height="13" left="0" lineheight="0" lines="0" right="23" top="0" width="23">
  <Textelement tag="b">foo </Textelement>
  <Textelement tag="b">bar</Textelement>
</Textbox>
"""
    self.assertEqual(want[1:], serialize(pdf[0][0]))
    # 2nd test, with leading non-tagged Textelement
    pdf = self._parse_xml("""
<fontspec id="0" size="5" family="X" color="#00000"/>
<text top="374" left="508" width="211" height="14" font="0">näringsidkaren <i>en</i> <i>varning. En var-</i></text>
""")
    want = """
<Textbox bottom="388" fontid="0" height="14" left="508" lineheight="0" lines="0" right="719" top="374" width="211">
  <Textelement>näringsidkaren </Textelement>
  <Textelement tag="i">en </Textelement>
  <Textelement tag="i">varning. En var-</Textelement>
</Textbox>
"""
    self.assertEqual(want[1:], serialize(pdf[0][0]))
def parametric_test(self, filename):
    """Run one FSMParser test fixture; on mismatch, re-run the parse with
    debugging enabled so the failing state-machine trace is printed."""
    resultfilename = filename.replace(".txt", ".xml")
    # no expected-result file -> run with debug tracing from the start
    debug = not os.path.exists(resultfilename)
    p, b = self.run_test_file(filename, debug)
    self.maxDiff = 4096
    if os.path.exists(resultfilename):
        with codecs.open(resultfilename, encoding="utf-8") as fp:
            result = fp.read().strip()
        # print(elements.serialize(b))
        if result != elements.serialize(b).strip():
            # re-run the parse but with debugging on
            print("============DEBUG OUTPUT================")
            p.debug = True
            tr = TextReader(filename, encoding="utf-8",
                            linesep=TextReader.UNIX)
            b = p.parse(tr.getiterator(tr.readparagraph))
            print("===============RESULT===================")
            print(elements.serialize(b))
            self.fail("========See output above=======")
        else:
            self.assertEqual(result, elements.serialize(b).strip())
    else:
        print("\nResult:\n" + elements.serialize(b))
        self.fail()
def test_serialize_roundtrip(self):
    """serialize()/deserialize() roundtrips: the XML format handles plain
    trees and native types; the JSON format additionally handles rdflib
    Graphs and pyparsing ParseResults."""
    # Create a elements object tree
    tree = Body([
        Section([Paragraph(["Hello"]),
                 Paragraph(["World"])],
                ordinal="1",
                title="Main section"),
        Section([
            42,
            date(2013, 11, 27),
            datetime(2013, 11, 27, 12, 0, 0),
            b'bytestring',
            {
                'foo': 'bar',
                'x': 'y'
            }
        ],
            ordinal=2,
            title="Native types")
    ])
    # roundtrip using the default XML format
    serialized = serialize(tree)
    self.assertIsInstance(serialized, str)
    newtree = deserialize(serialized, caller_globals=globals())
    self.assertEqual(tree, newtree)
    # make another section with special (but commonly used) types
    # and try to roundtrip them. The XML serialization format does
    # not support this.
    graph = Graph().parse(
        data="""@prefix dcterms: <http://purl.org/dc/terms/> .

<http://example.org/1> dcterms:title "Hello world"@en .
""", format="turtle")
    parseresult = urlparser.parseString("http://example.org/1")
    tree.append(Section([parseresult, graph], meta=graph))
    # roundtrip using JSON (which uses fully qualified classnames,
    # so we don't need to pass globals() into deserialize()
    serialized = serialize(tree, format="json")
    self.assertIsInstance(serialized, str)
    newtree = deserialize(serialized, format="json")
    # two pyparsing.ParseResult objects cannot be directly
    # compared (they don't implement __eq__), therefore we compare
    # their XML representations
    tree[2][0] = util.parseresults_as_xml(tree[2][0])
    newtree[2][0] = util.parseresults_as_xml(newtree[2][0])
    self.assertEqual(tree, newtree)
def test_add_different_types(self):
    """Adding two Textboxes whose elements are of different types keeps the
    elements as separate children of the combined box; in-place addition
    must produce the same result as the binary operator.

    NOTE(review): this copy was truncated after ``box1 += box2``; the
    final assertion has been restored from the identical sibling copy of
    this test in this file.
    """
    box1 = Textbox([Textelement("hey", tag=None)],
                   fontid=None, top=0, left=0, width=50, height=10, lines=1)
    box2 = Textbox([LinkedTextelement("1", tag="s", uri="foo.html")],
                   fontid=None, top=0, left=50, width=5, height=10, lines=1)
    combinedbox = box1 + box2
    want = """
<Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="55" top="0" width="55">
  <Textelement>hey</Textelement>
  <LinkedTextelement tag="s" uri="foo.html">1</LinkedTextelement>
</Textbox>
"""
    self.assertEqual(want[1:], serialize(combinedbox))
    # make sure __iadd__ performs like __add__
    box1 += box2
    self.assertEqual(want[1:], serialize(box1))
class Elements(unittest.TestCase):
    """Tests for Textbox/Textelement addition semantics."""

    maxDiff = None

    def test_addboxes(self):
        """Adding two Textboxes with same-type Textelements merges the text
        into one element; __iadd__ must behave like __add__.

        NOTE(review): this copy was truncated after ``box1 += box2``; the
        final assertion has been restored from the identical sibling copy
        of this test in this file.
        """
        box1 = Textbox([Textelement("hey ", tag=None)],
                       fontid=None, top=0, left=0, width=50, height=10, lines=1)
        box2 = Textbox([Textelement("ho", tag=None)],
                       fontid=None, top=0, left=50, width=40, height=10, lines=1)
        combinedbox = box1 + box2
        want = """
<Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="90" top="0" width="90">
  <Textelement>hey ho</Textelement>
</Textbox>
"""
        self.assertEqual(want[1:], serialize(combinedbox))
        # make sure __iadd__ performs like __add__
        box1 += box2
        self.assertEqual(want[1:], serialize(box1))
def test_after_footnote_tag(self):
    """An empty italicized element occurring in the after-footnote context
    must be removed entirely rather than crashing the parser.

    NOTE(review): this span was a shifted window — it began with an orphan
    fragment of the preceding test and was truncated before the final
    assertion; the full method has been reconstructed from the complete
    sibling copy of this test in this file.
    """
    # minimized version of Prop 2011/12:60 p 147. It seems to be
    # the empty italized textelement, combined with the
    # after_footnote context, that caused a crash
    pdf = self._parse_xml("""
<fontspec id="0" size="12" family="Times New Roman" color="#000000"/>
<fontspec id="4" size="12" family="Times New Roman,Italic" color="#000000"/>
<fontspec id="9" size="7" family="Times New Roman" color="#000000"/>
<text top="63" left="283" width="37" height="13" font="0">20 a §</text>
<text top="60" left="320" width="5" height="9" font="9">4</text>
<text top="442" left="304" width="4" height="13" font="4"><i> </i></text>
<text top="460" left="306" width="41" height="13" font="4"><i>20 b § </i></text>
""")
    # make sure that empty element is removed completely
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="76" fontid="0" height="16" left="283" lineheight="0" lines="0" right="325" top="60" width="42">
    <Textelement>20 a §</Textelement>
    <Textelement tag="sup">4</Textelement>
  </Textbox>
  <Textbox bottom="473" fontid="4" height="31" left="304" lineheight="0" lines="0" right="347" top="442" width="43">
    <Textelement tag="i">20 b § </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def test_italic_superscript_unreliable_font(self):
    """Fonts that are really the same family (aliased by the decoder) must
    still produce a correctly merged Textbox with an italic-superscript
    footnote marker in between.

    NOTE(review): this copy was truncated after the ``want`` literal; the
    final assertion has been restored from the identical sibling copy of
    this test in this file.
    """
    # the thing here is that font 2 and font 7 really has the same
    # font family.
    # ferenda.sources.legal.se.decoders.OffsetDecoder1d knows this
    # since it's hard-coded. The main problem is that the
    # OffsetDecoder1d.fontspecs methods (that aliases the fonts)
    # is run after PDFReader._parse_xml. Maybe we need to make
    # ._parse_xml call into the given textdecoder for each
    # fontspec tag it encounters?
    from ferenda.sources.legal.se.decoders import OffsetDecoder1d
    pdf = self._parse_xml("""
<fontspec id="2" size="14" family="MAMMBB+TT5Eo00" color="#000000"/>
<fontspec id="7" size="7" family="MBAAAC+TTA1o00" color="#000000"/>
<text top="552" left="340" width="372" height="17" font="2">intressant om 50 år föreslås att projektet Kulturarw</text>
<text top="549" left="712" width="5" height="13" font="7"><i>3</i></text>
<text top="552" left="717" width="98" height="17" font="2"> får fortsätta </text>
""", OffsetDecoder1d)
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="569" fontid="2" height="20" left="340" lineheight="0" lines="0" right="815" top="549" width="475">
    <Textelement>intressant om 50 år föreslås att projektet Kulturarw</Textelement>
    <Textelement tag="is">3</Textelement>
    <Textelement> får fortsätta </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def parse_document_from_soup(self, soup, doc):
    """Build doc.body for a W3C standard: first via the inherited HTML
    parsing, then restructured with a FSMParser.

    Raises whatever the FSMParser raises; when fsmdebug is enabled, the
    traceback is printed before re-raising.
    """
    # first run inherited version to get a doc.body tree that's
    # close to the actual HTML
    super(W3Standards, self).parse_document_from_soup(soup, doc)
    # then clean up doc.body best as you can with a FSMParser
    parser = self.get_parser()
    if not self.config.fsmdebug:
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    parser.debug = self.config.fsmdebug
    try:
        doc.body = parser.parse(doc.body)
    except:
        # best-effort diagnostics, then always re-raise. (The bare
        # except is acceptable only because of the unconditional raise.)
        print("Exception")
        if parser.debug:
            import traceback
            # print_exc() replaces the old sys.exc_info() triple, which
            # shadowed the builtins `type` and `tb`-style names
            traceback.print_exc()
        raise
    PreambleSection.counter = 0
    self.decorate_bodyparts(doc.body, doc.uri)
    if parser.debug:
        print(serialize(doc.body))
def test_after_footnote_tag(self):
    """An empty italicized element occurring in the after-footnote context
    must be removed entirely rather than crashing the parser."""
    # minimized version of Prop 2011/12:60 p 147. It seems to be
    # the empty italized textelement, combined with the
    # after_footnote context, that caused a crash
    pdf = self._parse_xml("""
<fontspec id="0" size="12" family="Times New Roman" color="#000000"/>
<fontspec id="4" size="12" family="Times New Roman,Italic" color="#000000"/>
<fontspec id="9" size="7" family="Times New Roman" color="#000000"/>
<text top="63" left="283" width="37" height="13" font="0">20 a §</text>
<text top="60" left="320" width="5" height="9" font="9">4</text>
<text top="442" left="304" width="4" height="13" font="4"><i> </i></text>
<text top="460" left="306" width="41" height="13" font="4"><i>20 b § </i></text>
""")
    # make sure that empty element is removed completely
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="76" fontid="0" height="16" left="283" lineheight="0" lines="0" right="325" top="60" width="42">
    <Textelement>20 a §</Textelement>
    <Textelement tag="sup">4</Textelement>
  </Textbox>
  <Textbox bottom="473" fontid="4" height="31" left="304" lineheight="0" lines="0" right="347" top="442" width="43">
    <Textelement tag="i">20 b § </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def test_space_insertion(self):
    """Combining boxes must recreate the trailing italicized space when
    rendered as XHTML."""
    # this is really more of a test of as_xhtml, but the starting point
    # is the XML parse. The goal is to recreate the trailing,
    # italicized, space in the second <text> element
    pdf = self._parse_xml("""
<fontspec id="10" size="7" family="Times New Roman" color="#000000"/>
<text top="699" left="327" width="226" height="20" font="10"><i>Myndig-</i></text>
<text top="720" left="327" width="230" height="20" font="10"><i>heten ska </i>lämna<i> </i></text>
<text top="740" left="327" width="230" height="20" font="10"><i>enligt</i> 23 a §.</text>
""")
    combined_tb = pdf[0][0] + pdf[0][1] + pdf[0][2]
    # make sure that empty element is removed completely
    want = """
<Textbox bottom="760" fontid="10" height="61" left="327" lineheight="0" lines="0" right="557" top="699" width="230">
  <Textelement tag="i">Myndigheten ska </Textelement>
  <Textelement>lämna </Textelement>
  <Textelement tag="i">enligt</Textelement>
  <Textelement> 23 a §.</Textelement>
</Textbox>
"""
    self.assertEqual(want[1:], serialize(combined_tb))
    res = etree.tostring(combined_tb.as_xhtml(None),
                         encoding="utf-8",
                         pretty_print=True).decode("utf-8")
    # strip the xmlns declaration to simplify the comparison below
    res = re.sub("p xmlns[^>]*", "p", res)
    want = """<p><i>Myndigheten ska </i>lämna <i>enligt</i> 23 a §.</p>"""
    self.assertEqual(want, res.strip())
def test_italic_superscript_unreliable_font(self):
    """Fonts that are really the same family (aliased by the decoder) must
    still produce a correctly merged Textbox with an italic-superscript
    footnote marker in between."""
    # the thing here is that font 2 and font 7 really has the same
    # font family.
    # ferenda.sources.legal.se.decoders.OffsetDecoder1d knows this
    # since it's hard-coded. The main problem is that the
    # OffsetDecoder1d.fontspecs methods (that aliases the fonts)
    # is run after PDFReader._parse_xml. Maybe we need to make
    # ._parse_xml call into the given textdecoder for each
    # fontspec tag it encounters?
    from ferenda.sources.legal.se.decoders import OffsetDecoder1d
    pdf = self._parse_xml(
        """
<fontspec id="2" size="14" family="MAMMBB+TT5Eo00" color="#000000"/>
<fontspec id="7" size="7" family="MBAAAC+TTA1o00" color="#000000"/>
<text top="552" left="340" width="372" height="17" font="2">intressant om 50 år föreslås att projektet Kulturarw</text>
<text top="549" left="712" width="5" height="13" font="7"><i>3</i></text>
<text top="552" left="717" width="98" height="17" font="2"> får fortsätta </text>
""", OffsetDecoder1d)
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="569" fontid="2" height="20" left="340" lineheight="0" lines="0" right="815" top="549" width="475">
    <Textelement>intressant om 50 år föreslås att projektet Kulturarw</Textelement>
    <Textelement tag="is">3</Textelement>
    <Textelement> får fortsätta </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def _test_parser(self, testfile, parser):
    """Run one legalref test file: test paragraphs separated by '---',
    with the expected serialization after the first blank-line divider.
    Paragraphs may start with RESET:/NOBASE: directives."""
    encoding = 'iso-8859-1'
    with codecs.open(testfile, encoding=encoding) as fp:
        testdata = fp.read()
    parts = re.split('\r?\n\r?\n', testdata, 1)
    if len(parts) == 1:
        # no expected-output section in the file
        want = ''
    else:
        (testdata, want) = parts
        want = want.replace("\r\n", "\n").strip()
    # p.currentlynamedlaws = {} # needed?
    test_paras = re.split('\r?\n---\r?\n', testdata)
    got_paras = []
    for para in test_paras:
        # "RESET:" directive: clear accumulated parser state
        if para.startswith("RESET:"):
            parser.currentlynamedlaws.clear()
        # "NOBASE:" directive: parse this paragraph without a base URI
        if para.startswith("NOBASE:"):
            baseuri = None
        else:
            baseuri = 'http://rinfo.lagrummet.se/publ/sfs/9999:999'
        # print("Parsing %r" % para)
        nodes = parser.parse(para, baseuri)
        got_paras.append(serialize(nodes).strip())
    got = "\n---\n".join(got_paras).replace("\r\n", "\n").strip()
    self.maxDiff = None
    self.assertEqual(want, got)
def test_parse_recursive(self):
    """parse_recursive() finds both URL and custom "Doc N/YYYY" citations
    anywhere in the element tree (headings, paragraphs, footnotes) and
    replaces them with LinkSubject elements."""
    doc_citation = ("Doc" + Word(nums).setResultsName("ordinal")
                    + "/" + Word(nums, exact=4).setResultsName("year")
                    ).setResultsName("DocRef")

    def doc_uri_formatter(parts):
        return "http://example.org/docs/%(year)s/%(ordinal)s/" % parts

    doc = Body([
        Heading(["About Doc 43/2012 and it's interpretation"]),
        Paragraph([
            "According to Doc 43/2012",
            Footnote(["Available at http://example.org/xyz"]),
            " the bizbaz should be frobnicated"
        ])
    ])
    # the expected tree after citation parsing
    result = Body([
        Heading([
            "About ",
            LinkSubject("Doc 43/2012",
                        predicate="dcterms:references",
                        uri="http://example.org/docs/2012/43/"),
            " and it's interpretation"
        ]),
        Paragraph([
            "According to ",
            LinkSubject("Doc 43/2012",
                        predicate="dcterms:references",
                        uri="http://example.org/docs/2012/43/"),
            Footnote([
                "Available at ",
                LinkSubject("http://example.org/xyz",
                            predicate="dcterms:references",
                            uri="http://example.org/xyz")
            ]),
            " the bizbaz should be frobnicated"
        ])
    ])
    cp = CitationParser(ferenda.citationpatterns.url, doc_citation)
    cp.set_formatter(
        URIFormatter(("url", ferenda.uriformats.url),
                     ("DocRef", doc_uri_formatter)))
    doc = cp.parse_recursive(doc)
    self.maxDiff = 4096
    self.assertEqual(serialize(doc), serialize(result))
def test_serialize_roundtrip(self):
    """serialize()/deserialize() roundtrips: the XML format handles plain
    trees and native types; the JSON format additionally handles rdflib
    Graphs and pyparsing ParseResults."""
    # Create a elements object tree
    tree = Body([Section([Paragraph(["Hello"]),
                          Paragraph(["World"])],
                         ordinal="1",
                         title="Main section"),
                 Section([42,
                          date(2013, 11, 27),
                          datetime(2013, 11, 27, 12, 0, 0),
                          b'bytestring',
                          {'foo': 'bar',
                           'x': 'y'}],
                         ordinal=2,
                         title="Native types")
                 ])
    # roundtrip using the default XML format
    serialized = serialize(tree)
    self.assertIsInstance(serialized, str)
    newtree = deserialize(serialized, caller_globals=globals())
    self.assertEqual(tree, newtree)
    # make another section with special (but commonly used) types
    # and try to roundtrip them. The XML serialization format does
    # not support this.
    graph = Graph().parse(data="""@prefix dcterms: <http://purl.org/dc/terms/> .

<http://example.org/1> dcterms:title "Hello world"@en .
""", format="turtle")
    parseresult = urlparser.parseString("http://example.org/1")
    tree.append(Section([parseresult, graph], meta=graph))
    # roundtrip using JSON (which uses fully qualified classnames,
    # so we don't need to pass globals() into deserialize()
    serialized = serialize(tree, format="json")
    self.assertIsInstance(serialized, str)
    newtree = deserialize(serialized, format="json")
    # two pyparsing.ParseResult objects cannot be directly
    # compared (they don't implement __eq__), therefore we compare
    # their XML representations
    tree[2][0] = util.parseresults_as_xml(tree[2][0])
    newtree[2][0] = util.parseresults_as_xml(newtree[2][0])
    self.assertEqual(tree, newtree)
def parse(self, doc):
    """Parse downloaded documents into structured XML and RDF."""
    reader = TextReader(self.store.downloaded_path(doc.basefile),
                        linesep=TextReader.UNIX)
    # Some more preprocessing: Remove the faux-bold formatting
    # used in some RFCs (using repetitions of characters
    # interleaved with backspace control sequences). Note: that
    # is '\b' as in backspace, not r'\b' as in word boundary
    # docstring = re.sub('.\b','',docstring)
    cleanparagraphs = (re.sub('.\b', '', x) for x in
                       reader.getiterator(reader.readparagraph))
    parser = self.get_parser(doc.basefile)
    if not self.config.fsmdebug:
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    parser.debug = self.config.fsmdebug
    doc.body = parser.parse(cleanparagraphs)
    header = doc.body.pop(0)  # body.findByClass(RFCHeader)
    title = " ".join(doc.body.pop(0).split())  # body.findByClass(DocHeader)
    # drop the (temporary) table-of-contents section, if present
    for part in doc.body:
        if isinstance(part, PreambleSection) and part.title == "Table of Contents":
            doc.body.remove(part)
            break
    # create (RDF) metadata for document Note: The provided
    # basefile may be incorrect -- let whatever is in the header
    # override
    realid = self.get_rfc_num(header)
    if not realid:  # eg RFC 100 -- fallback to basefile in that case
        realid = doc.basefile
    doc.uri = self.canonical_uri(realid)
    desc = Describer(doc.meta, doc.uri)
    desc.rdftype(self.ns['rfc'].RFC)
    desc.value(self.ns['dct'].title, title, lang="en")
    self.parse_header(header, desc)
    if not desc.getvalues(self.ns['dct'].identifier):
        desc.value(self.ns['dct'].identifier, "RFC %s" % doc.basefile)
    doc.lang = "en"
    # process body - remove the temporary Pagebreak objects, after
    # having extracted the shortTitle found in them
    shorttitle = self.cleanup_body(doc.body)
    if shorttitle and (desc.getvalue(self.ns['dct'].title) != shorttitle):
        desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")
    # process body - add good metadata
    citparser = self.make_citation_parser()
    doc.body = citparser.parse_recursive(doc.body)
    # reset shared state before decorating body parts
    PreambleSection.counter = 0
    self.decorate_bodyparts(doc.body, doc.uri)
    if self.config.fsmdebug:
        print(serialize(doc.body))
def testparser(testcase, parser, filename):
    """Helper function to test :py:class:`~ferenda.FSMParser` based parsers.

    Parses *filename* and compares the serialized result against the
    sibling .xml file; raises AssertionError (showing the parse result)
    when no expectation file exists yet.
    """
    wantfilename = filename.replace(".txt", ".xml")
    # enable debug tracing when there is nothing to compare against,
    # or when explicitly requested through the environment
    if not os.path.exists(wantfilename) or 'FERENDA_FSMDEBUG' in os.environ:
        parser.debug = True
    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    b = parser.parse(tr.getiterator(tr.readparagraph))
    if 'FERENDA_FSMDEBUG' in os.environ:
        print(elements.serialize(b))
    testcase.maxDiff = 4096
    if os.path.exists(wantfilename):
        with codecs.open(wantfilename, encoding="utf-8") as fp:
            want = fp.read().strip()
        got = elements.serialize(b).strip()
        testcase.assertEqualXML(want, got)
    else:
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(b))
def test_json_roundtrip(self):
    """A realistic JSON serialize/deserialize roundtrip of a document
    parsed from a PDF fixture."""
    # a more realistic roundtrip example with some hairy parts
    from ferenda import PDFDocumentRepository, PDFReader
    d = PDFDocumentRepository()
    doc = d.make_document("sample")
    # make SURE that the intermediate files are newer than the pdf
    os.utime("test/files/pdfreader/intermediate/sample.xml", None)
    reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                       workdir="test/files/pdfreader/intermediate")
    d.parse_from_pdfreader(reader, doc)
    jsondoc = serialize(doc, format="json")
    newdoc = deserialize(jsondoc, format="json")
    self.assertEqual(doc, newdoc)
def test_ending_whitespace_tag(self):
    """A tagged trailing whitespace run (<i> </i>) is folded into the
    preceding untagged text, and gluing two boxes inserts a space.

    NOTE(review): this copy was truncated after the ``re.sub`` call; the
    final comparison has been restored from the identical sibling copy of
    this test in this file.
    """
    pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">Something<i> </i></text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement>Something </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
    pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">Something<i> </i></text>
<text top="706" left="148" width="4" height="18" font="3">Else</text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement>Something </Textelement>
  </Textbox>
  <Textbox bottom="724" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="706" width="4">
    <Textelement>Else</Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
    # concatenate the two boxes and make sure that an additional
    # space is produced
    glued = pdf[0][0] + pdf[0][1]
    res = etree.tostring(glued.as_xhtml(None)).decode()
    res = re.sub("p xmlns[^>]*", "p", res)
    want = "<p>Something Else</p>"
    self.assertEqual(want, res)
def test_empty(self):
    """A <text> element containing nothing but whitespace must yield a
    Textbox with no Textelement children at all."""
    pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3"> <b> </b> </text>
""")
    expected = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4" />
</Page>
"""
    self.assertEqual(expected[1:], serialize(pdf[0]))
def _test_parser(self, testfile, parser):
    """Run one legalref test file (newer variant): supports the
    RESET:/NOBASE:/BASE: directives and passes base-URI attributes
    through the URI minter."""
    # encoding = 'iso-8859-1'
    encoding = 'windows-1252'
    with codecs.open(testfile, encoding=encoding) as fp:
        testdata = fp.read()
    parts = re.split('\r?\n\r?\n', testdata, 1)
    if len(parts) == 1:
        # no expected-output section in the file
        want = ''
    else:
        (testdata, want) = parts
        want = want.replace("\r\n", "\n").strip()
    # p.currentlynamedlaws = {} # needed?
    test_paras = re.split('\r?\n---\r?\n', testdata)
    got_paras = []
    # we need to set up logging in some way as legalref.parse will
    # use logging facilities. For tests though, we should only log
    # at CRITICAL level.
    import logging
    r = logging.getLogger()
    if not r.handlers:
        h = logging.StreamHandler()
        h.setFormatter(
            logging.Formatter("%(name)s %(levelname)s %(message)s"))
        r.addHandler(h)
    r.setLevel(logging.CRITICAL)
    for para in test_paras:
        if para.startswith("RESET:"):
            # clear accumulated parser state between paragraphs
            parser.currentlynamedlaws.clear()
        elif para.startswith("NOBASE:"):
            baseuri_attributes = {}
        elif para.startswith("BASE:"):
            # first line carries a literal dict of base-URI attributes
            b = para.split("\n")[0].split(":", 1)[1]
            baseuri_attributes = ast.literal_eval(b)
            if 'type' in baseuri_attributes:
                baseuri_attributes['type'] = URIRef(
                    baseuri_attributes['type'])
            para = para.split("\n", 1)[1]
            if 'kommittensbetankande' in baseuri_attributes:
                parser.kommittensbetankande = baseuri_attributes[
                    'kommittensbetankande']
                del baseuri_attributes['kommittensbetankande']
        else:
            baseuri_attributes = {'law': '9999:999'}
        nodes = parser.parse(para,
                             self.minter,
                             self.metadata,
                             baseuri_attributes)
        got_paras.append(serialize(nodes).strip())
    got = "\n---\n".join(got_paras).replace("\r\n", "\n").strip()
    self.maxDiff = None
    self.assertEqual(want, got)
def test_empty(self):
    """A <text> element containing nothing but whitespace must yield a
    Textbox with no Textelement children at all.

    NOTE(review): this copy was truncated after the ``want`` literal; the
    final assertion has been restored from the identical sibling copy of
    this test in this file.
    """
    pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3"> <b> </b> </text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4" />
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def parametric_test(self, filename):
    """Run one FSMParser test fixture; on mismatch, re-run the parse with
    debugging enabled so the failing state-machine trace is printed."""
    resultfilename = filename.replace(".txt", ".xml")
    # no expected-result file -> run with debug tracing from the start
    debug = not os.path.exists(resultfilename)
    p, b = self.run_test_file(filename, debug)
    self.maxDiff = 4096
    if os.path.exists(resultfilename):
        with codecs.open(resultfilename, encoding="utf-8") as fp:
            result = fp.read().strip()
        # print(elements.serialize(b))
        if result != elements.serialize(b).strip():
            # re-run the parse but with debugging on
            print("============DEBUG OUTPUT================")
            p.debug = True
            tr = TextReader(filename, encoding="utf-8",
                            linesep=TextReader.UNIX)
            b = p.parse(tr.getiterator(tr.readparagraph))
            print("===============RESULT===================")
            print(elements.serialize(b))
            self.fail("========See output above=======")
        else:
            self.assertEqual(result, elements.serialize(b).strip())
    else:
        print("\nResult:\n" + elements.serialize(b))
        self.fail()
def test_serialize_pyparsing(self):
    """pyparsing ParseResults can be serialized to XML (even though they
    cannot be roundtripped back -- see the roundtrip tests)."""
    from ferenda.citationpatterns import url
    parsed = url.parseString("http://example.org/foo?param=val")
    expected = """<Body>
  <url>
    <netloc>example.org</netloc>
    <path>/foo</path>
    <query>param=val</query>
    <scheme>http</scheme>
  </url>
</Body>
"""
    self.assertEqual(expected, serialize(Body([parsed])))
def test_elements_from_soup(self):
    """elements_from_soup() converts a BeautifulSoup tree into the
    corresponding ferenda.elements.html object tree."""
    from ferenda.elements import html
    soup = BeautifulSoup(
        """<body>
<h1>Sample</h1>
<div class="main">
<img src="xyz.png"/>
<p>Some <b>text</b></p>
<dl>
<dt>Term 1</dt>
<dd>Definition 1</dd>
</dl>
</div>
<div id="foot">
<hr/>
<a href="/">home</a> - <a href="/about">about</a>
</div>
</body>""", "lxml")
    body = html.elements_from_soup(soup.body)
    # print("Body: \n%s" % serialize(body))
    result = html.Body([
        html.H1(["Sample"]),
        html.Div([
            html.Img(src="xyz.png"),
            html.P(["Some ",
                    html.B(["text"])]),
            html.DL([html.DT(["Term 1"]),
                     html.DD(["Definition 1"])])
        ], **{"class": "main"}),
        html.Div([
            html.HR(),
            html.A(["home"], href="/"),
            " - ",
            html.A(["about"], href="/about")
        ], id="foot")
    ])
    self.maxDiff = 4096
    self.assertEqual(serialize(body), serialize(result))
def test_ending_whitespace_tag(self):
    """A tagged trailing whitespace run (<i> </i>) is folded into the
    preceding untagged text, and gluing two boxes inserts a space."""
    pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">Something<i> </i></text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement>Something </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
    pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3">Something<i> </i></text>
<text top="706" left="148" width="4" height="18" font="3">Else</text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement>Something </Textelement>
  </Textbox>
  <Textbox bottom="724" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="706" width="4">
    <Textelement>Else</Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
    # concatenate the two boxes and make sure that an additional
    # space is produced
    glued = pdf[0][0] + pdf[0][1]
    res = etree.tostring(glued.as_xhtml(None)).decode()
    # strip the xmlns declaration to simplify the comparison below
    res = re.sub("p xmlns[^>]*", "p", res)
    want = "<p>Something Else</p>"
    self.assertEqual(want, res)
def parametric_test(self, filename):
    """Parse one SFS test fixture through the repo's parser pipeline
    (ids, definitions, lagrum references) and compare the serialized
    result against the expected .xml file."""
    self.maxDiff = None
    reader = TextReader(filename=filename, encoding='iso-8859-1',
                        linesep=TextReader.DOS)
    reader.autostrip = True
    # p.lagrum_parser = FakeParser()
    parser = self.p.get_parser("9999:998", reader)
    b = parser(reader)
    elements = self.p._count_elements(b)
    # FIXME: How was this used? Where should we plug
    # skipfragments?
    if 'K' in elements and elements['K'] > 1 and elements['P1'] < 2:
        self.p.skipfragments = [
            ('rinfoex:avdelningnummer', 'rpubl:kapitelnummer'),
            ('rpubl:kapitelnummer', 'rpubl:paragrafnummer')]
    else:
        self.p.skipfragments = [('rinfoex:avdelningnummer',
                                 'rpubl:kapitelnummer')]
    # NB: _construct_ids won't look for references
    self.p.visit_node(b, self.p.construct_id, {'basefile': '9999:998',
                                               'uris': set()})
    self.p.visit_node(b, self.p.find_definitions, False, debug=False)
    self.p.lagrum_parser.parse_recursive(b)
    self._remove_uri_for_testcases(b)
    resultfilename = filename.replace(".txt", ".xml")
    if os.path.exists(resultfilename):
        with codecs.open(resultfilename, encoding="utf-8") as fp:
            result = fp.read().strip()
        self.assertEqual(result, serialize(b).strip())
    else:
        # no expected-result file: fail, showing the full serialization
        self.assertEqual("", serialize(b).strip())
    # reset the state of the repo...
    self.p.current_section = '0'
    self.p.current_headline_level = 0
def test_serialize_pyparsing(self):
    # these objects can't be roundtripped -- serialization of pyparsing
    # ParseResults is one-way only, so we only check the XML output.
    from ferenda.citationpatterns import url
    x = url.parseString("http://example.org/foo?param=val")
    serialized = serialize(Body([x]))
    self.assertEqual(
        """<Body>
  <url>
    <netloc>example.org</netloc>
    <path>/foo</path>
    <query>param=val</query>
    <scheme>http</scheme>
  </url>
</Body>
""", serialized)
def test_middle_whitespace_tag(self):
    # A whitespace-only tag (<i> </i>) in the middle of a bold run should be
    # merged into the preceding bold Textelement, not emitted separately.
    pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3"><b>Verksamhetsregion<i> </i></b><b>Lund </b></text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement tag="b">Verksamhetsregion </Textelement>
    <Textelement tag="b">Lund </Textelement>
  </Textbox>
</Page>
"""
    # res = etree.tostring(pdf.as_xhtml(None)).decode()
    self.assertEqual(want[1:], serialize(pdf[0]))
def test_parse_recursive(self):
    # A custom pyparsing grammar matching references like "Doc 43/2012".
    doc_citation = ("Doc" + Word(nums).setResultsName("ordinal")
                    + "/" + Word(nums, exact=4).setResultsName("year")
                    ).setResultsName("DocRef")

    def doc_uri_formatter(parts):
        # Turn the named parse results into a URI.
        return "http://example.org/docs/%(year)s/%(ordinal)s/" % parts

    # Input tree containing both a custom citation and a plain URL.
    doc = Body([Heading(["About Doc 43/2012 and it's interpretation"]),
                Paragraph(["According to Doc 43/2012",
                           Footnote(["Available at http://example.org/xyz"]),
                           " the bizbaz should be frobnicated"])
                ])
    # Expected tree: every recognized citation replaced by a LinkSubject.
    result = Body([Heading(["About ",
                            LinkSubject("Doc 43/2012",
                                        predicate="dct:references",
                                        uri="http://example.org/docs/2012/43/"),
                            " and it's interpretation"]),
                   Paragraph(["According to ",
                              LinkSubject("Doc 43/2012",
                                          predicate="dct:references",
                                          uri="http://example.org/docs/2012/43/"),
                              Footnote(["Available at ",
                                        LinkSubject("http://example.org/xyz",
                                                    predicate="dct:references",
                                                    uri="http://example.org/xyz")
                                        ]),
                              " the bizbaz should be frobnicated"])
                   ])
    cp = CitationParser(ferenda.citationpatterns.url, doc_citation)
    cp.set_formatter(URIFormatter(("url", ferenda.uriformats.url),
                                  ("DocRef", doc_uri_formatter)))
    doc = cp.parse_recursive(doc)
    self.maxDiff = 4096
    self.assertEqual(serialize(doc), serialize(result))
self.assertEqual(want, res)

def test_middle_whitespace_tag(self):
    # NOTE(review): this chunk ends after building ``want``; the final
    # assertEqual appears to lie outside this view.
    pdf = self._parse_xml("""
<fontspec id="3" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="686" left="148" width="4" height="18" font="3"><b>Verksamhetsregion<i> </i></b><b>Lund </b></text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="704" fontid="3" height="18" left="148" lineheight="0" lines="0" right="152" top="686" width="4">
    <Textelement tag="b">Verksamhetsregion </Textelement>
    <Textelement tag="b">Lund </Textelement>
  </Textbox>
</Page>
"""
    # res = etree.tostring(pdf.as_xhtml(None)).decode()
def test_footnote(self):
    # A small-font digit sandwiched between normal-font runs should be
    # detected as a footnote marker and tagged "sup", with all three runs
    # merged into one Textbox.
    pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="830" left="85" width="241" height="20" font="7">bindande verkan för det allmänna.</text>
<text top="829" left="327" width="5" height="12" font="15">7</text>
<text top="830" left="332" width="227" height="20" font="7">Bestämmelsen kan således inte </text>"""
                              )
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="850" fontid="7" height="21" left="85" lineheight="0" lines="0" right="559" top="829" width="474">
    <Textelement>bindande verkan för det allmänna.</Textelement>
    <Textelement tag="sup">7</Textelement>
    <Textelement>Bestämmelsen kan således inte </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def test_footnote(self):
    # NOTE(review): this chunk ends after building ``want``; the final
    # assertEqual appears to lie outside this view.
    pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="830" left="85" width="241" height="20" font="7">bindande verkan för det allmänna.</text>
<text top="829" left="327" width="5" height="12" font="15">7</text>
<text top="830" left="332" width="227" height="20" font="7">Bestämmelsen kan således inte </text>""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="850" fontid="7" height="21" left="85" lineheight="0" lines="0" right="559" top="829" width="474">
    <Textelement>bindande verkan för det allmänna.</Textelement>
    <Textelement tag="sup">7</Textelement>
    <Textelement>Bestämmelsen kan således inte </Textelement>
  </Textbox>
</Page>
"""
def test_serialize_roundtrip(self):
    """serialize() followed by deserialize() must yield an equal tree,
    including native-typed leaf values (int, date, bytes, dict)."""
    main_section = Section([Paragraph(["Hello"]),
                            Paragraph(["World"])],
                           ordinal="1", title="Main section")
    native_section = Section([42,
                              date(2013, 11, 27),
                              b'bytestring',
                              {'foo': 'bar', 'x': 'y'}],
                             ordinal=2, title="Native types")
    original = Body([main_section, native_section])

    xml = serialize(original)
    self.assertIsInstance(xml, str)

    roundtripped = deserialize(xml, globals())
    self.assertEqual(original, roundtripped)
def test_linked_footnote(self):
    # Footnote markers wrapped in <a> elements should become
    # LinkedTextelements (the marker itself getting tag="s"), preserving
    # the link target URI.
    pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="830" left="85" width="241" height="20" font="7">bindande verkan för det allmänna.</text>
<text top="829" left="327" width="5" height="12" font="15"><a href="unik-kunskap-genom-registerforskning-sou-201445.html#120">7</a></text>
<text top="830" left="332" width="227" height="20" font="7"><a href="unik-kunskap-genom-registerforskning-sou-201445.html#120"> </a>Bestämmelsen kan således inte </text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="850" fontid="7" height="21" left="85" lines="-2" right="559" top="829" width="474">
    <Textelement>bindande verkan för det allmänna.</Textelement>
    <LinkedTextelement tag="s" uri="unik-kunskap-genom-registerforskning-sou-201445.html#120">7</LinkedTextelement>
    <LinkedTextelement uri="unik-kunskap-genom-registerforskning-sou-201445.html#120"> </LinkedTextelement>
    <Textelement>Bestämmelsen kan således inte </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def test_comment(self):
    # XML comments in the input must be ignored by the parser (they only
    # occur in handcrafted test fixtures, never in real pdf2xml output).
    pdf = self._parse_xml("""
<fontspec id="1" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="270" left="278" width="450" height="12" font="1">First line</text>
<!-- comments like this won't appear in real pdf2xml output, but might appear in test cases -->
<text top="290" left="278" width="450" height="12" font="1">Second line</text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="282" fontid="1" height="12" left="278" lineheight="0" lines="0" right="728" top="270" width="450">
    <Textelement>First line</Textelement>
  </Textbox>
  <Textbox bottom="302" fontid="1" height="12" left="278" lineheight="0" lines="0" right="728" top="290" width="450">
    <Textelement>Second line</Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def test_comment(self):
    # NOTE(review): this chunk ends after building ``want``; the final
    # assertEqual appears to lie outside this view.
    pdf = self._parse_xml("""
<fontspec id="1" size="11" family="TimesNewRomanPS" color="#000000"/>
<text top="270" left="278" width="450" height="12" font="1">First line</text>
<!-- comments like this won't appear in real pdf2xml output, but might appear in test cases -->
<text top="290" left="278" width="450" height="12" font="1">Second line</text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="282" fontid="1" height="12" left="278" lineheight="0" lines="0" right="728" top="270" width="450">
    <Textelement>First line</Textelement>
  </Textbox>
  <Textbox bottom="302" fontid="1" height="12" left="278" lineheight="0" lines="0" right="728" top="290" width="450">
    <Textelement>Second line</Textelement>
  </Textbox>
</Page>
"""
def test_footnote_footer(self):
    # A footnote *definition* at the bottom of the page (tiny font marker
    # followed by footer text) should form its own Textbox, with the marker
    # tagged "sup".
    pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="16" size="10" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="17" size="5" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="849" left="85" width="472" height="20" font="7">ligga till grund för några individuella rättigheter. I 2 kap. 4 och 5 §§ </text>
<text top="891" left="85" width="4" height="9" font="17">7</text>
<text top="891" left="89" width="258" height="15" font="16"> Prop. 1975/76:209 s. 128, prop. 2009/10:80 s. 173. </text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="869" fontid="7" height="20" left="85" lineheight="0" lines="0" right="557" top="849" width="472">
    <Textelement>ligga till grund för några individuella rättigheter. I 2 kap. 4 och 5 §§ </Textelement>
  </Textbox>
  <Textbox bottom="906" fontid="16" height="15" left="85" lineheight="0" lines="0" right="347" top="891" width="262">
    <Textelement tag="sup">7</Textelement>
    <Textelement> Prop. 1975/76:209 s. 128, prop. 2009/10:80 s. 173. </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
def test_footnote_footer(self):
    # NOTE(review): this chunk ends after building ``want``; the final
    # assertEqual appears to lie outside this view.
    pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="15" size="7" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="16" size="10" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<fontspec id="17" size="5" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
<text top="849" left="85" width="472" height="20" font="7">ligga till grund för några individuella rättigheter. I 2 kap. 4 och 5 §§ </text>
<text top="891" left="85" width="4" height="9" font="17">7</text>
<text top="891" left="89" width="258" height="15" font="16"> Prop. 1975/76:209 s. 128, prop. 2009/10:80 s. 173. </text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="869" fontid="7" height="20" left="85" lineheight="0" lines="0" right="557" top="849" width="472">
    <Textelement>ligga till grund för några individuella rättigheter. I 2 kap. 4 och 5 §§ </Textelement>
  </Textbox>
  <Textbox bottom="906" fontid="16" height="15" left="85" lineheight="0" lines="0" right="347" top="891" width="262">
    <Textelement tag="sup">7</Textelement>
    <Textelement> Prop. 1975/76:209 s. 128, prop. 2009/10:80 s. 173. </Textelement>
  </Textbox>
</Page>
"""
def test_footnote_lineending(self):
    # A footnote marker at the end of a line (followed by a whitespace-only
    # run) should be merged into the preceding Textbox as a "sup" element,
    # while the following paragraph starts a new box.
    pdf = self._parse_xml("""
<fontspec id="0" size="13" family="GGKKGC+TimesNewRomanPSMT" color="#000000"/>
<fontspec id="4" size="13" family="GGKKID+TimesNewRomanPS-ItalicMT" color="#000000"/>
<fontspec id="7" size="7" family="GGKKGC+TimesNewRomanPSMT" color="#000000"/>
<text top="161" left="291" width="401" height="17" font="0">Härigenom föreskrivs i fråga om mervärdesskattelagen (1994:200)</text>
<text top="159" left="692" width="5" height="11" font="7">7</text>
<text top="161" left="697" width="4" height="17" font="0"> </text>
<text top="178" left="291" width="249" height="17" font="4"><i>dels</i> att 1 kap. 12 § ska upphöra att gälla, </text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="178" fontid="0" height="19" left="291" lineheight="0" lines="0" right="697" top="159" width="406">
    <Textelement>Härigenom föreskrivs i fråga om mervärdesskattelagen (1994:200)</Textelement>
    <Textelement tag="sup">7</Textelement>
  </Textbox>
  <Textbox bottom="195" fontid="4" height="17" left="291" lineheight="0" lines="0" right="540" top="178" width="249">
    <Textelement tag="i">dels</Textelement>
    <Textelement> att 1 kap. 12 § ska upphöra att gälla, </Textelement>
  </Textbox>
</Page>
"""
    self.assertEqual(want[1:], serialize(pdf[0]))
self.assertEqual(want[1:], serialize(pdf[0]))

def test_space_insertion(self):
    # this is really more of a test of as_xhtml, but the starting point is
    # the XML parse. The goal is to recreate the trailing, italicized,
    # space in the second <text> element
    # NOTE(review): this chunk ends right after computing ``res``; the
    # final assertion appears to lie outside this view.
    pdf = self._parse_xml("""
<fontspec id="10" size="7" family="Times New Roman" color="#000000"/>
<text top="699" left="327" width="226" height="20" font="10"><i>Myndig-</i></text>
<text top="720" left="327" width="230" height="20" font="10"><i>heten ska </i>lämna<i> </i></text>
<text top="740" left="327" width="230" height="20" font="10"><i>enligt</i> 23 a §.</text>
""")
    combined_tb = pdf[0][0] + pdf[0][1] + pdf[0][2]
    # make sure that empty element is removed completely
    want = """
<Textbox bottom="760" fontid="10" height="61" left="327" lineheight="0" lines="0" right="557" top="699" width="230">
  <Textelement tag="i">Myndigheten ska </Textelement>
  <Textelement>lämna </Textelement>
  <Textelement tag="i">enligt</Textelement>
  <Textelement> 23 a §.</Textelement>
</Textbox>
"""
    self.assertEqual(want[1:], serialize(combined_tb))
    res = etree.tostring(combined_tb.as_xhtml(None),
                         encoding="utf-8", pretty_print=True).decode("utf-8")
    # strip the xmlns declaration to keep the expected string readable
    res = re.sub("p xmlns[^>]*", "p", res)
def fsmparse(self, functionname, source):
    """Parse a list of text chunks using a named fsm parser and
    output the parse tree and final result to stdout.

    :param functionname: Dotted path (``module.Class.method``) to a
                         function that returns a configured
                         :py:class:`~ferenda.FSMParser`
    :type functionname: str
    :param source: A file containing the text chunks, separated by
                   double newlines
    :type source: str
    :raises ValueError: if the named class does not exist in the module
    """
    modulename, classname, methodname = functionname.rsplit(".", 2)
    __import__(modulename)
    m = sys.modules[modulename]
    # Find the named class. The previous for/break loop silently fell
    # through to the *last* class in the module when no name matched
    # (and raised a confusing NameError if the module had no classes);
    # fail loudly instead.
    for name, cls in inspect.getmembers(m, inspect.isclass):
        if name == classname:
            break
    else:
        raise ValueError("Class %s not found in module %s" %
                         (classname, modulename))
    method = getattr(cls, methodname)
    parser = method()
    parser.debug = True  # dump state transitions while parsing
    tr = TextReader(source)
    b = parser.parse(tr.getiterator(tr.readparagraph))
    print(serialize(b))
def test_footnote_lineending(self):
    # NOTE(review): this chunk ends after building ``want``; the final
    # assertEqual appears to lie outside this view.
    pdf = self._parse_xml("""
<fontspec id="0" size="13" family="GGKKGC+TimesNewRomanPSMT" color="#000000"/>
<fontspec id="4" size="13" family="GGKKID+TimesNewRomanPS-ItalicMT" color="#000000"/>
<fontspec id="7" size="7" family="GGKKGC+TimesNewRomanPSMT" color="#000000"/>
<text top="161" left="291" width="401" height="17" font="0">Härigenom föreskrivs i fråga om mervärdesskattelagen (1994:200)</text>
<text top="159" left="692" width="5" height="11" font="7">7</text>
<text top="161" left="697" width="4" height="17" font="0"> </text>
<text top="178" left="291" width="249" height="17" font="4"><i>dels</i> att 1 kap. 12 § ska upphöra att gälla, </text>
""")
    want = """
<Page height="750" number="1" width="500">
  <Textbox bottom="178" fontid="0" height="19" left="291" lineheight="0" lines="0" right="697" top="159" width="406">
    <Textelement>Härigenom föreskrivs i fråga om mervärdesskattelagen (1994:200)</Textelement>
    <Textelement tag="sup">7</Textelement>
  </Textbox>
  <Textbox bottom="195" fontid="4" height="17" left="291" lineheight="0" lines="0" right="540" top="178" width="249">
    <Textelement tag="i">dels</Textelement>
    <Textelement> att 1 kap. 12 § ska upphöra att gälla, </Textelement>
  </Textbox>
</Page>
"""
def parse(self, doc):
    """Parse downloaded documents into structured XML and RDF.

    Reads the downloaded text for ``doc.basefile``, runs it through the
    FSM parser, extracts header metadata into ``doc.meta`` and markup
    into ``doc.body``, then runs the citation parser over the body.
    Returns True on completion.
    """
    reader = TextReader(self.store.downloaded_path(doc.basefile),
                        linesep=TextReader.UNIX)
    # Some more preprocessing: Remove the faux-bold formatting
    # used in some RFCs (using repetitions of characters
    # interleaved with backspace control sequences). Note: that
    # is '\b' as in backspace, not r'\b' as in word boundary
    # docstring = re.sub('.\b','',docstring)
    cleanparagraphs = (re.sub('.\b', '', x) for x in
                       reader.getiterator(reader.readparagraph))
    parser = self.get_parser(doc.basefile)
    if not self.config.fsmdebug:
        # allow turning on parser debugging via the environment
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    parser.debug = self.config.fsmdebug
    doc.body = parser.parse(cleanparagraphs)
    header = doc.body.pop(0)  # body.findByClass(RFCHeader)
    # normalize whitespace in the title paragraph
    title = " ".join(doc.body.pop(0).split())  # body.findByClass(DocHeader)
    # drop the ToC section -- it will be regenerated
    for part in doc.body:
        if isinstance(part, PreambleSection) and \
                part.title == "Table of Contents":
            doc.body.remove(part)
            break
    # create (RDF) metadata for document Note: The provided
    # basefile may be incorrect -- let whatever is in the header
    # override
    realid = self.get_rfc_num(header)
    if not realid:  # eg RFC 100 -- fallback to basefile in that case
        realid = doc.basefile
    doc.uri = self.canonical_uri(realid)
    desc = Describer(doc.meta, doc.uri)
    desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    desc.value(self.ns['dcterms'].title, title, lang="en")
    self.parse_header(header, desc)
    # parse_header might have set .rdftype, but if not:
    try:
        desc.getrdftype()
    except KeyError:
        desc.rdftype(self.ns['rfc'].RFC)
    if not desc.getvalues(self.ns['dcterms'].identifier):
        desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile)
    doc.lang = "en"
    # process body - remove the temporary Pagebreak objects, after
    # having extracted the shortTitle found in them
    shorttitle = self.cleanup_body(doc.body)
    if shorttitle and (desc.getvalue(self.ns['dcterms'].title) != shorttitle):
        desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")
    # process body - add good metadata
    citparser = self.make_citation_parser()
    doc.body = citparser.parse_recursive(doc.body)
    PreambleSection.counter = 0
    # self.decorate_bodyparts(doc.body,doc.uri)
    if self.config.fsmdebug:
        print(serialize(doc.body))
    return True
def test_serialize_newstr(self):
    """Attribute values passed through str() serialize as plain XML
    attributes.

    Really a test for future.types.newstr.newstr, here aliased to
    str() -- this is only ever an issue on py2.
    """
    body = Body([], a=str("x"), b="y")
    self.assertEqual('<Body a="x" b="y" />\n',
                     serialize(body, format="xml"))
but by the catch-all is_paragraph. The recognizers are run in the order specified by FSMParser.set_transitions(). This is a preformatted section. It could be used for source code, +-------------------+ | line drawings | +-------------------+ or what have you. Second section ============== The above new section implicitly closed the first section which we were in. This was made explicit by the last transition rule, which stated that any time a section is encountered while in the "section" state, we should not create any more children (False) but instead return to our previous state (which in this case is "body", but for a more complex language could be any number of states).""" p = FSMParser() p.set_recognizers(is_section, is_preformatted, is_paragraph) p.set_transitions(transitions) p.initial_constructor = make_body p.initial_state = "body" body = p.parse(text.split("\n\n")) # print(elements.serialize(body)) # end main return_value = elements.serialize(body)
def wrapper(self, doc):
    """Run the wrapped parse function *f*, then render and validate its
    results: serialized JSON (optional), XHTML+RDFa, and a distilled
    RDF/XML file, logging warnings for missing metadata."""
    # call the actual function that creates the doc data
    oldbasefile = doc.basefile
    ret = f(self, doc)
    if doc.basefile != oldbasefile:
        # means that basefile was adjusted. Touch the old parsed
        # path first so we don't regenerate.
        with self.store.open_parsed(oldbasefile, "w"):
            pass
        # move any intermediate files (in particular extracted
        # image backgrounds from PDF files) that might be
        # needed later.
        old_intermediate = self.store.intermediate_path(oldbasefile)
        new_intermediate = self.store.intermediate_path(doc.basefile)
        if self.store.storage_policy == "dir":
            old_intermediate = os.path.dirname(old_intermediate)
            new_intermediate = os.path.dirname(new_intermediate)
        if os.path.exists(old_intermediate) and not os.path.exists(new_intermediate):
            util.ensure_dir(new_intermediate)
            os.rename(old_intermediate, new_intermediate)
    # now render that doc data as files (JSON, XHTML, RDF/XML)
    if self.config.serializejson == True:
        with self.store.open_serialized(doc.basefile, "wb") as fp:
            r = serialize(doc, format="json")  # should be a (unicode) str
            fp.write(r.encode('utf-8'))
        self.log.debug(
            "Created %s" % (self.store.serialized_path(doc.basefile)))
    # css file + background images + png renderings of text
    resources = self.create_external_resources(doc)
    if resources:
        cssuris = [cssuri(doc.uri, x) for x in resources if x.endswith(".css")]
    else:
        cssuris = []
    if cssuris:
        doc.cssuris = cssuris
    updated = self.render_xhtml(doc, self.store.parsed_path(doc.basefile,
                                                            version=doc.version))
    if updated:
        self.log.debug(
            "Created %s" % (self.store.parsed_path(doc.basefile)))

    # Extract all triples on the XHTML/RDFa data to a separate
    # RDF/XML file
    distilled_graph = Graph()
    with codecs.open(self.store.parsed_path(doc.basefile,
                                            version=doc.version),
                     encoding="utf-8") as fp:  # unicode
        distilled_graph.parse(data=fp.read(), format="rdfa",
                              publicID=doc.uri)
    # The act of parsing from RDFa binds a lot of namespaces
    # in the graph in an unnecessary manner. Particularly it
    # binds both 'dc' and 'dcterms' to
    # 'http://purl.org/dc/terms/', which makes serialization
    # less than predictable. Blow these prefixes away.
    distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
    distilled_graph.bind(
        "dcterms",
        URIRef("http://example.org/this-prefix-should-not-be-used"))

    util.ensure_dir(self.store.distilled_path(doc.basefile,
                                              version=doc.version))
    with open(self.store.distilled_path(doc.basefile,
                                        version=doc.version),
              "wb") as distilled_file:
        # print("============distilled===============")
        # print(distilled_graph.serialize(format="turtle").decode('utf-8'))
        distilled_graph.serialize(distilled_file, format="pretty-xml")
    self.log.debug(
        '%s triples extracted to %s', len(distilled_graph),
        self.store.distilled_path(doc.basefile, version=doc.version))

    # Validate that all required triples are present (we check
    # distilled_graph, but we could just as well check doc.meta)
    required = sorted(set(self.get_required_predicates(doc)))
    for p in required:
        x = distilled_graph.value(URIRef(doc.uri), p)
        if not x:
            self.log.warning("Metadata is missing a %s triple" %
                             (distilled_graph.qname(p)))
    if 'validaterdfa' in self.config and self.config.validaterdfa:
        # Validate that all triples specified in doc.meta and any
        # .meta property on any body object is present in the
        # XHTML+RDFa file. NOTE: graph_diff has suddenly become
        # glacial on medium-large graphs (> ~1000 triples). Maybe we
        # don't have to validate them?
        huge_graph = False
        for g in iterate_graphs(doc.body):
            doc.meta += g
            if len(doc.meta) > 1000:
                huge_graph = True
                break
        if huge_graph:
            self.log.warning("Graph seems huge, skipping validation")
        else:
            # self.log.debug("diffing graphs")
            (in_both, in_first, in_second) = graph_diff(doc.meta,
                                                        distilled_graph)
            self.log.debug("graphs diffed (-%s, +%s)"
                           % (len(in_first), len(in_second)))
            if in_first:  # original metadata not present in the XHTML file
                self.log.warning("%d triple(s) from the original metadata was "
                                 "not found in the serialized XHTML file:\n%s",
                                 len(in_first),
                                 in_first.serialize(format="n3").decode("utf-8"))

    # Validate that entry.title and entry.id has been filled
    # (might be from doc.meta and doc.uri, might be other things)
    entry = DocumentEntry(self.store.documententry_path(doc.basefile,
                                                        version=doc.version))
    if not entry.id:
        self.log.warning("entry.id missing")
    if not entry.title:
        self.log.warning("entry.title missing")
    return ret
def do_test(self, keywords, want):
    """Build a toc page body from *keywords* and compare its second
    child against the expected serialization *want*."""
    repo = LNKeyword()
    items = map(self.makeitem, keywords)
    page_body = repo.toc_generate_page_body(items, None)
    actual = serialize(page_body[1])
    self.assertEqual(want, actual)