def test_addboxes(self):
    # Two horizontally adjacent boxes are combined with ``+`` and ``+=``;
    # both results must serialize to one box whose text is merged into a
    # single Textelement.
    left_box = Textbox([Textelement("hey ", tag=None)], fontid=None,
                       top=0, left=0, width=50, height=10, lines=1)
    right_box = Textbox([Textelement("ho", tag=None)], fontid=None,
                        top=0, left=50, width=40, height=10, lines=1)
    # The first character of the literal is skipped before comparing.
    expected = """ <Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="90" top="0" width="90"> <Textelement>hey ho</Textelement> </Textbox> """[1:]

    merged = left_box + right_box
    self.assertEqual(expected, serialize(merged))

    # In-place addition (__iadd__) has to behave exactly like __add__.
    left_box += right_box
    self.assertEqual(expected, serialize(left_box))
def test_leading_tag(self):
    # A tagged element that comes first in the box is wrapped in its tag,
    # followed directly by the untagged text.
    elements = [
        Textelement("bold", tag="b"),
        Textelement("normal", tag=None),
    ]
    box = Textbox(elements, top=0, left=0, width=100, height=100, fontid=0)
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px"><b>bold</b>normal</p> """
    self._test_asxhtml(expected, box)
def test_superscripts(self):
    # The tags "sup", "is" and "bis" are expected to render as <sup>,
    # <i><sup> and <b><i><sup> respectively.
    elements = [
        Textelement("1", tag="sup"),
        Textelement("2", tag="is"),
        Textelement("3", tag="bis")
    ]
    box = Textbox(elements, top=0, left=0, width=100, height=100, fontid=0)
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px"><sup>1</sup><i><sup>2</sup></i><b><i><sup>3</sup></i></b></p> """
    self._test_asxhtml(expected, box)
def test_add_different_types(self):
    # Adding a box holding a LinkedTextelement to one holding a plain
    # Textelement must keep the two elements separate in the result.
    plain_box = Textbox([Textelement("hey", tag=None)], fontid=None,
                        top=0, left=0, width=50, height=10, lines=1)
    linked_box = Textbox([LinkedTextelement("1", tag="s", uri="foo.html")],
                         fontid=None, top=0, left=50, width=5, height=10,
                         lines=1)
    # The first character of the literal is skipped before comparing.
    expected = """ <Textbox bottom="10" fontid="0" height="10" left="0" lineheight="0" lines="1" right="55" top="0" width="55"> <Textelement>hey</Textelement> <LinkedTextelement tag="s" uri="foo.html">1</LinkedTextelement> </Textbox> """[1:]

    merged = plain_box + linked_box
    self.assertEqual(expected, serialize(merged))

    # In-place addition (__iadd__) has to behave exactly like __add__.
    plain_box += linked_box
    self.assertEqual(expected, serialize(plain_box))
def test_tag_merge(self):
    # Consecutive elements carrying the same tag (including no tag at all)
    # are expected to be rendered as one run of text.
    elements = [
        Textelement("identical ", tag=None),
        Textelement("tags ", tag=None),
        Textelement("should ", tag="b"),
        Textelement("merge", tag="b"),
    ]
    box = Textbox(elements, top=0, left=0, width=100, height=100, fontid=0)
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">identical tags <b>should merge</b></p> """
    self._test_asxhtml(expected, box)
def test_elements_with_tags(self):
    # Untagged, bold ("b"), italic ("i") and combined ("bi") elements each
    # get their own markup; "bi" becomes nested <b><i>.
    elements = [
        Textelement("normal", tag=None),
        Textelement("bold", tag="b"),
        Textelement("italic", tag="i"),
        Textelement("both", tag="bi")
    ]
    box = Textbox(elements, top=0, left=0, width=100, height=100, fontid=0)
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">normal<b>bold</b><i>italic</i><b><i>both</i></b></p> """
    self._test_asxhtml(expected, box)
def test_linkelements(self):
    # Linked elements are rendered as <a href=...>; a linked element with
    # tag "s" is expected to get a <sup> nested inside the anchor.
    elements = [
        Textelement("normal", tag=None),
        LinkedTextelement("link", uri="http://example.org/", tag=None),
        Textelement("footnote marker", tag="sup"),
        LinkedTextelement("linked footnote marker",
                          uri="http://example.org/", tag="s")
    ]
    box = Textbox(elements, top=0, left=0, width=100, height=100, fontid=0)
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">normal<a href="http://example.org/">link</a><sup>footnote marker</sup><a href="http://example.org/"><sup>linked footnote marker</sup></a></p> """
    self._test_asxhtml(expected, box)
def test_basic(self):
    # Smallest case: one untagged text element inside a positioned box.
    elements = [Textelement("test", tag=None)]
    box = Textbox(elements, top=0, left=0, width=100, height=100, fontid=0)
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">test</p> """
    self._test_asxhtml(expected, box)
def test_empty_removal(self):
    # A linked element whose text is a single space must not show up in
    # the rendered output; only the following text remains.
    elements = [
        LinkedTextelement(" ", uri="index.html#24", tag=None),
        Textelement("23", tag=None)
    ]
    box = Textbox(elements, top=0, left=0, width=100, height=100, fontid=0)
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">23</p> """
    self._test_asxhtml(expected, box)
def __call__(self, textbox, fontspecs):
    """Decode the text elements of *textbox* in place and return it.

    The box is returned untouched unless the fontspec selected by
    ``textbox.fontid`` has an ``'encoding'`` entry equal to ``"Custom"``.
    Otherwise every ``Textelement`` in the box is replaced by a new
    ``Textelement`` holding ``self.decode_string(element, self.map)`` and
    the original tag. If at least one item in the box has the tag
    ``'i'``, only the ``'i'``-tagged elements are decoded.
    """
    fontspec = fontspecs[textbox.fontid]
    # A fontspec without an 'encoding' entry only occurs for some testcases.
    if 'encoding' not in fontspec or fontspec['encoding'] != "Custom":
        return textbox

    # NOTE: This weird checking for occurrences of 'i' tags is needed for
    # functionalSources.TestPropRegeringen.test_parse_1999_2000_17 to pass
    # (and matches encoding usage in practice).
    tags = [getattr(item, 'tag', None) for item in textbox]
    has_italic = 'i' in tags

    for position, element in enumerate(textbox):
        if not isinstance(element, Textelement):
            continue
        if has_italic and element.tag != 'i':
            # When italics are present, only the italic parts are encoded.
            continue
        decoded = self.decode_string(element, self.map)
        textbox[position] = Textelement(decoded, tag=element.tag)
    return textbox
def test_other_elements(self):
    # A box may mix Textelements with other element types (here a
    # LinkSubject) and raw strings; all of them must be rendered.
    elements = [
        Textelement("plaintext ", tag=None),
        LinkSubject("link", uri="http://example.org/",
                    predicate="dcterms:references"),
        " raw string"
    ]
    box = Textbox(elements, top=0, left=0, width=100, height=100, fontid=0)
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">plaintext <a href="http://example.org/" rel="dcterms:references">link</a> raw string</p> """
    self._test_asxhtml(expected, box)

    # Drop the trailing str so that the LinkSubject becomes the last item.
    box[:] = box[:-1]
    expected = """ <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">plaintext <a href="http://example.org/" rel="dcterms:references">link</a></p> """
    self._test_asxhtml(expected, box)
def __call__(self, textbox, fontspecs):
    """Decode *textbox* in place, with special handling for two font families.

    The box is returned untouched unless the fontspec selected by
    ``textbox.fontid`` has ``'encoding'`` equal to ``"Custom"``.

    For the font family ``"Times.New.Roman.Fet0100"`` the first element is
    decoded here (either only a leading part, which is then tagged ``"b"``,
    or the whole element); any remaining elements are handed to the parent
    class' ``__call__``. For every other family the whole box is handed to
    the parent class.

    Finally, if the box' font family is ``"Times.New.Roman.Kursiv0104"``
    and some element has the tag ``"i"``, the box is switched to the font
    id that ``self.find_fontid`` returns for ``"Times-Roman"``.

    Returns the (possibly modified) textbox.
    """
    if fontspecs[textbox.fontid]['encoding'] != "Custom":
        return textbox
    if textbox.font.family == "Times.New.Roman.Fet0100":
        # Offset into the first element where the encoded part ends;
        # stays None when the whole element is treated as encoded.
        boundary = None
        # extra special hack for prop 1997/98:44 which has
        # textelements marked as having a font with custom
        # encoding, but where only the bolded part (which
        # isn't marked up...) is encoded, while the rest is
        # unencoded. The "g" is an encoded section sign, which
        # in these cases is the last encoded char.
        if (len(textbox[0].split(" ", 2)) == 3 and
                textbox[0].split(" ", 2)[1] == "g"):
            # position of the second space in the first element
            boundary = textbox[0].index(" ", textbox[0].index(" ") + 1)
        # a similar situation with paragraphs with leading bold
        # type, where the bold text is any of 3-4 fixed strings
        # (Note: the xml data doesn't contain any information
        # about the text being bold, or rather that the following
        # text is non-bold)
        else:
            m = self.re_fixedleaders.match(textbox[0])
            if m:
                boundary = m.end()
        if boundary:
            # Split the first element in two: a decoded, bold leader
            # and the untouched (already unencoded) remainder.
            orig = str(textbox[0])
            textbox[0] = Textelement(self.decode_string(
                orig[:boundary], self.map), tag="b")
            textbox.insert(1, Textelement(orig[boundary:], tag=None))
            # Find the id for the "real" non-bold font. I think
            # that in every known case the fontid should simply be
            # the default font (id=0). Maybe we could hardcode
            # that right away, like we hardcode the font family
            # name right now.
            newfontid = self.find_fontid(fontspecs, "Times-Roman",
                                         textbox.font.size)
            expected_length = 2
        else:
            # No boundary found: decode the entire first element and
            # keep its tag and the box' font id.
            textbox[0] = Textelement(self.decode_string(
                textbox[0], self.map), tag=textbox[0].tag)
            expected_length = 1
            newfontid = textbox.fontid
        if len(textbox) > expected_length:
            # the <text> element contained subelements
            # save and remove the 1-2 textelements we've processed
            decoded = textbox[:expected_length]
            textbox[:] = textbox[expected_length:]
            # do the default decoding
            textbox = super(OffsetDecoder20, self).__call__(textbox, fontspecs)
            # then add the previously processed elements
            textbox[:] = decoded + textbox[:]
        if newfontid != textbox.fontid:
            # invalidate the cached property
            del textbox.__dict__['font']
            textbox.fontid = newfontid
    # NOTE(review): this else is read as belonging to the font-family
    # check above (all other custom-encoded fonts) — confirm.
    else:
        textbox = super(OffsetDecoder20, self).__call__(textbox, fontspecs)
    # again, if one or more textelements have an "i" tag, the
    # font for the entire textbox probably shouldn't be
    # specced as an italic ("Kursiv")
    if textbox.font.family == "Times.New.Roman.Kursiv0104" and "i" in [
            x.tag for x in textbox]:
        newfontid = self.find_fontid(fontspecs, "Times-Roman",
                                     textbox.font.size)
        # invalidate the cached property
        del textbox.__dict__['font']
        textbox.fontid = newfontid
    return textbox