Example #1
0
 def test_1d_encoding(self):
     # Verify that textboxes set in a font with a custom encoding are
     # decoded by OffsetDecoder1d.
     #
     # Since version 0.57.0, pdftohtml silently discards some invalid
     # characters from its output. The font used here has a custom
     # encoding in which 0x03 (ctrl-c) represents space, so pdftohtml
     # 0.57+ drops that character before we get a chance to read and
     # decode it. Until something can be done about recent pdftohtml
     # versions, the sample is always copied first (instead of only
     # as a fallback on ExternalCommandError) so that this test uses
     # pre-generated output from an older version.
     from ferenda.sources.legal.se.decoders import OffsetDecoder1d
     self._copy_sample()
     decoder = OffsetDecoder1d()
     reader = PDFReader(
         filename="test/files/pdfreader/custom-encoding.pdf",
         workdir=self.datadir,
         textdecoder=decoder)
     boxes = list(reader.textboxes())
     # The original note says textboxes 5 and 6 use the custom
     # encoding; textbox 7 is checked as well.
     expectations = ((5, "Göran Persson"),
                     (6, "Bosse Ringholm"),
                     (7, "(Finansdepartementet)"))
     for index, text in expectations:
         self.assertEqual(text, str(boxes[index]))
Example #2
0
 def test_1d_encoding(self):
     # Verify that textboxes set in a font with a custom encoding are
     # decoded by OffsetDecoder1d.
     #
     # pdftohtml 0.57.0 and later silently discards some invalid
     # characters from its output. The font in this file has a custom
     # encoding in which 0x03 (ctrl-c) represents space, and newer
     # pdftohtml versions drop that character before we can read and
     # decode it. For that reason the sample is copied unconditionally
     # (not just as a fallback on ExternalCommandError), so the test
     # runs on pre-generated output from an older pdftohtml.
     from ferenda.sources.legal.se.decoders import OffsetDecoder1d
     self._copy_sample()
     pdf_kwargs = {
         "filename": "test/files/pdfreader/custom-encoding.pdf",
         "workdir": self.datadir,
         "textdecoder": OffsetDecoder1d(),
     }
     reader = PDFReader(**pdf_kwargs)
     textboxes = list(reader.textboxes())
     # The original note says textboxes 5 and 6 use the custom
     # encoding; textbox 7 is checked as well.
     for position, wanted in ((5, "Göran Persson"),
                              (6, "Bosse Ringholm"),
                              (7, "(Finansdepartementet)")):
         self.assertEqual(wanted, str(textboxes[position]))
Example #3
0
    def test_bz2(self):
        # Verify that keep_xml="bz2" leaves only a bz2-compressed XML
        # file in the workdir and that the reader content is usable.
        xml_path = self.datadir + os.sep + "sample.xml"
        bz2_path = self.datadir + os.sep + "sample.xml.bz2"
        try:
            reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                               workdir=self.datadir,
                               keep_xml="bz2")
        except errors.ExternalCommandError:
            self._copy_sample()
            # bzip2 our canned sample.xml. Both files are handled by
            # context managers so that neither handle leaks if the
            # read or write fails.
            with open(xml_path, "rb") as rfp, BZ2File(bz2_path, "wb") as wfp:
                wfp.write(rfp.read())
            os.unlink(xml_path)
            reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                               workdir=self.datadir,
                               keep_xml="bz2")

        # a temporary copy of the pdf file should not be lying around in workdir
        self.assertFalse(os.path.exists(self.datadir + os.sep + "sample.pdf"))
        # but the XML file (only in bz2 format) should be stored
        self.assertTrue(os.path.exists(bz2_path))
        self.assertFalse(os.path.exists(xml_path))

        # first page, first box
        self.assertEqual("Document title ", str(reader[0][0]))

        # parsing again should reuse the existing sample.xml.bz2
        # (nothing is asserted about the result; this only checks that
        # constructing the reader a second time does not raise)
        reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                           workdir=self.datadir,
                           keep_xml="bz2")
Example #4
0
    def parse(self, doc):
        """Fill in ``doc`` from the best available downloaded source file.

        Basic metadata is recorded first: the canonical URI, the rdf
        type, a ``prov:wasGeneratedBy`` value, and whatever
        ``infer_triples`` adds. The content is then parsed from a PDF
        if one exists (either the downloaded ``.pdf`` or one converted
        from a downloaded word processing file), and otherwise from the
        text between ``<pre>`` and ``</pre>`` in the downloaded HTML
        file.

        :param doc: the document object to populate. Its ``basefile``
                    selects the files to read; its ``uri`` is set and
                    its ``meta`` is handed to the Describer.
        """
        doc.uri = self.canonical_uri(doc.basefile)
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        self.infer_triples(d, doc.basefile)

        # prefer PDF or Word files over the plaintext-containing HTML files
        # FIXME: PDF or Word files are now stored as attachments

        pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')

        wordfiles = (self.generic_path(doc.basefile, 'downloaded', '.doc'),
                     self.generic_path(doc.basefile, 'downloaded', '.docx'),
                     self.generic_path(doc.basefile, 'downloaded', '.wpd'),
                     self.generic_path(doc.basefile, 'downloaded', '.rtf'))
        # If several candidates exist, the last one in the tuple wins.
        wordfile = None
        for f in wordfiles:
            if os.path.exists(f):
                wordfile = f

        # if we lack a .pdf file, use Open/LibreOffice to convert any
        # .wpd or .doc file to .pdf first
        if (wordfile
                and not os.path.exists(pdffile)):
            intermediate_pdf = self.generic_path(
                doc.basefile, "intermediate", ".pdf")
            if not os.path.exists(intermediate_pdf):
                # NOTE(review): the output dir is quoted but the input
                # file is not, so a path containing spaces would break
                # the command -- confirm how util.runcmd executes it
                # before changing the quoting.
                cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (
                    self.config.get('soffice', 'soffice'),
                    os.path.dirname(intermediate_pdf),
                    wordfile)
                self.log.debug(
                    "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
                # The return value is not needed; require_success is
                # passed so that a failed conversion is not ignored.
                util.runcmd(cmdline, require_success=True)
            pdffile = intermediate_pdf

        if os.path.exists(pdffile):
            self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
            # '.foo' is a dummy suffix; only the directory part is used.
            intermediate_dir = os.path.dirname(
                self.generic_path(doc.basefile, 'intermediate', '.foo'))
            self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
            pdfreader = PDFReader()
            pdfreader.read(pdffile, intermediate_dir)
            self.parse_from_pdfreader(pdfreader, doc)
        else:
            downloaded_path = self.downloaded_path(doc.basefile)
            intermediate_path = self.generic_path(
                doc.basefile, 'intermediate', '.txt')
            self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                           downloaded_path, intermediate_path))
            if not os.path.exists(intermediate_path):
                # Read via a context manager so the file handle is
                # closed deterministically.
                with codecs.open(downloaded_path,
                                 encoding="iso-8859-1") as fp:
                    html = fp.read()
                util.writefile(intermediate_path, util.extract_text(
                    html, '<pre>', '</pre>'), encoding="utf-8")
            textreader = TextReader(intermediate_path, encoding="utf-8")
            self.parse_from_textreader(textreader, doc)
Example #5
0
class Read(unittest.TestCase):
    # Tests PDFReader.read() against test/files/pdfreader/sample.pdf,
    # using a fresh temporary working directory for every test.

    def setUp(self):
        # Show full diffs on failure, and give each test its own
        # scratch directory and reader instance.
        self.maxDiff = None
        self.datadir = tempfile.mkdtemp()
        self.reader = PDFReader()

    def tearDown(self):
        # Remove the scratch directory and everything in it.
        shutil.rmtree(self.datadir)

    def test_basic(self):
        sample_pdf = "test/files/pdfreader/sample.pdf"
        try:
            self.reader.read(sample_pdf, self.datadir)
        except errors.ExternalCommandError:
            # The external command failed: copy the canned
            # intermediate files into the scratch directory (renaming
            # "index" to "sample" in the file names) and retry.
            for fname in os.listdir("test/files/pdfreader/intermediate"):
                source = "test/files/pdfreader/intermediate/%s" % fname
                target = self.datadir + os.sep + fname.replace("index",
                                                               "sample")
                shutil.copy(source, target)
            self.reader.read(sample_pdf, self.datadir)
        self.assertEqual(len(self.reader), 1)

        # first page, first box
        self.assertEqual("Document title ", str(self.reader[0][0]))

        self.assertEqual(318, self.reader.median_box_width())

        page = self.reader[0]
        self.assertEqual(
            "Page 1 (892 x 1263): 'Document title  This is a simple documen...'",
            str(page))

        # an uncropped doc should have two textboxes
        uncropped = list(page.boundingbox())
        self.assertEqual(2, len(uncropped))

        # a smaller bounding box yields just one
        restricted = list(page.boundingbox(190, 130, 230, 460))
        self.assertEqual(1, len(restricted))

        # cropping it with the same dimensions
        page.crop(190, 130, 230, 460)

        # should also result in just one box -- the bottom one
        remaining = list(page.boundingbox())
        self.assertEqual(1, len(remaining))

        bottom = remaining[0]
        self.assertEqual("This is a simple document in PDF format. ",
                         str(bottom))
        expected_font = {'color': '#000000',
                         'size': '16',
                         'id': '1',
                         'family': 'Times'}
        self.assertEqual(expected_font, bottom.getfont())

        # this box should have four text elements
        self.assertEqual(4, len(bottom))
        for position, tag in enumerate((None, "i", "ib", None)):
            self.assertEqual(tag, bottom[position].tag)
    def pdfreader_from_basefile(self, basefile):
        """Return a PDFReader that has read the downloaded file for
        ``basefile``.

        The reader's working directory is the directory part of the
        store's intermediate path for the same basefile.
        """
        source = self.store.downloaded_path(basefile)
        # Convoluted way of getting the directory of the intermediate
        # xml + png files that PDFReader will create
        workdir = os.path.dirname(self.store.intermediate_path(basefile))
        reader = PDFReader()
        reader.read(source, workdir)
        return reader
Example #7
0
    def test_fallback_ocr(self):
        # A scanned PDF read without OCR should be empty; reading it
        # again with ocr_lang set should produce text.
        scanned = "test/files/pdfreader/scanned-ecma-99.pdf"
        try:
            # actually running tesseract takes ages -- for day-to-day
            # testing we can just as well use the canned hocr.html
            # files that _copy_sample fixes for us. Raising here jumps
            # straight to the fallback below unless
            # FERENDA_TEST_TESSERACT is set in the environment.
            if not os.environ.get("FERENDA_TEST_TESSERACT"):
                raise errors.ExternalCommandError
            reader = PDFReader(filename=scanned,
                               workdir=self.datadir,
                               images=False)
        except errors.ExternalCommandError:
            self._copy_sample()
            reader = PDFReader(filename=scanned,
                               workdir=self.datadir,
                               images=False)

        self.assertTrue(reader.is_empty())

        reader = PDFReader(filename=scanned,
                           workdir=self.datadir,
                           ocr_lang="eng")
        self.assertFalse(reader.is_empty())
        self.assertEqual(2, len(reader))
        second_box = util.normalize_space(str(reader[0][1]))
        self.assertEqual("EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
                         second_box)
Example #8
0
    def _parse_xml(self, xmlfrag, decoding_class=BaseTextDecoder):
        pdf = PDFReader(pages=True)
        pdf.fontspec = {}
        pdf._textdecoder = decoding_class()
        xml = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">
<pdf2xml producer="poppler" version="0.24.3">
<page number="1" position="absolute" top="0" left="0" height="750" width="500">
%s
</page>
</pdf2xml>""" % xmlfrag
        xmlfp = BytesIO(xml.encode("utf-8"))
        xmlfp.name = "dummy.xml"
        pdf._parse_xml(xmlfp)
        return pdf
Example #9
0
    def test_dontkeep(self):
        # With keep_xml=False, neither a plain nor a bz2-compressed
        # XML file should remain in the workdir after reading.
        plain_xml = self.datadir + os.sep + "sample.xml"
        packed_xml = self.datadir + os.sep + "sample.xml.bz2"
        sample_pdf = "test/files/pdfreader/sample.pdf"

        self.assertFalse(os.path.exists(packed_xml))
        try:
            reader = PDFReader(filename=sample_pdf,
                               workdir=self.datadir,
                               keep_xml=False)
        except errors.ExternalCommandError:
            # fall back to the canned sample and retry
            self._copy_sample()
            reader = PDFReader(filename=sample_pdf,
                               workdir=self.datadir,
                               keep_xml=False)

        # No XML file should exist
        self.assertFalse(os.path.exists(plain_xml))
        self.assertFalse(os.path.exists(packed_xml))
Example #10
0
 def setUp(self):
     """Build the glue function under test and a PDFReader whose text
     decoder is a pass-through stub."""
     # create a mock analyzer
     mock_analyzer = PDFAnalyzer(None)
     mock_analyzer.scanned_source = self.scanned_source
     self.gluefunc = Offtryck().get_gluefunc('basefile', mock_analyzer)

     self.pdfreader = reader = PDFReader()
     reader.fontspec = {}

     # Stand-in decoder: a callable that returns its first argument
     # unchanged, with a ``fontspec`` attribute that does the same.
     def passthrough(x, y):
         return x

     reader._textdecoder = passthrough
     reader._textdecoder.fontspec = lambda x: x
Example #11
0
 def setUp(self):
     """Build the glue function under test and a PDFReader that
     decodes text with OffsetDecoder20."""
     from ferenda.sources.legal.se.decoders import OffsetDecoder20

     # create a mock analyzer
     mock_analyzer = PDFAnalyzer(None)
     mock_analyzer.scanned_source = False
     self.gluefunc = Offtryck().get_gluefunc('basefile', mock_analyzer)

     self.pdfreader = reader = PDFReader()
     reader.fontspec = {}
     reader._textdecoder = OffsetDecoder20()
Example #12
0
 def test_json_roundtrip(self):
     # a more realistic roundtrip example with some hairy parts: a
     # document parsed from a PDF must compare equal to itself after
     # being serialized to JSON and deserialized again.
     from ferenda import PDFDocumentRepository, PDFReader
     repo = PDFDocumentRepository()
     original = repo.make_document("sample")
     # make SURE that the intermediate files are newer than the pdf
     # (passing None sets the timestamps to the current time)
     os.utime("test/files/pdfreader/intermediate/sample.xml", None)
     pdf = PDFReader(filename="test/files/pdfreader/sample.pdf",
                     workdir="test/files/pdfreader/intermediate")
     repo.parse_from_pdfreader(pdf, original)
     restored = deserialize(serialize(original, format="json"),
                            format="json")
     self.assertEqual(original, restored)
Example #13
0
    def test_fallback_ocr(self):
        # Reading a scanned PDF without OCR should give an empty
        # reader; reading it again with ocr_lang set should give text.
        no_ocr = dict(filename="test/files/pdfreader/scanned-ecma-99.pdf",
                      workdir=self.datadir,
                      images=False)
        try:
            # actually running tesseract takes ages -- for day-to-day
            # testing we can just as well use the canned hocr.html
            # files that _copy_sample fixes for us. The real run is
            # only attempted when FERENDA_TEST_TESSERACT is set.
            if not os.environ.get("FERENDA_TEST_TESSERACT"):
                raise errors.ExternalCommandError
            reader = PDFReader(**no_ocr)
        except errors.ExternalCommandError:
            self._copy_sample()
            reader = PDFReader(**no_ocr)

        self.assertTrue(reader.is_empty())

        reader = PDFReader(
            filename="test/files/pdfreader/scanned-ecma-99.pdf",
            workdir=self.datadir,
            ocr_lang="eng")
        self.assertFalse(reader.is_empty())
        self.assertEqual(2, len(reader))
        heading = util.normalize_space(str(reader[0][1]))
        self.assertEqual("EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
                         heading)
Example #14
0
    def test_ocr(self):
        # Read a scanned PDF with Swedish OCR and check the hOCR file,
        # the page count, and the text and geometry of selected elements.
        scanned = "test/files/pdfreader/scanned.pdf"
        try:
            # Only attempt a real OCR run when FERENDA_TEST_TESSERACT
            # is set; otherwise go straight to the canned sample.
            if not os.environ.get("FERENDA_TEST_TESSERACT"):
                raise errors.ExternalCommandError
            reader = PDFReader(filename=scanned,
                               workdir=self.datadir,
                               ocr_lang="swe")
        except errors.ExternalCommandError:
            self._copy_sample()
            reader = PDFReader(filename=scanned,
                               workdir=self.datadir,
                               ocr_lang="swe")

        # assert that a hOCR file has been created
        hocr_file = self.datadir + os.sep + "scanned.hocr.html"
        self.assertTrue(os.path.exists(hocr_file))

        # assert that we have two pages
        self.assertEqual(2, len(reader))

        # assert that first element in the first textbox in the first
        # page corresponds to the first bbox, scaled by the
        # pixel/point scaling factor.
        first_element = reader[0][0][0]
        self.assertEqual("Regeringens ", str(first_element))
        self.assertEqual(47, first_element.top)
        self.assertEqual(38, first_element.left)
        self.assertEqual(21, first_element.height)
        self.assertEqual(118, first_element.width)

        # assert that the textbox at index 5 (which has mostly normal
        # text) is rendered correctly (note that we have a couple of
        # OCR errors). An earlier, longer expectation against the
        # textbox at index 3 was disabled by the original author.
        body_text = util.normalize_space(str(reader[0][5]))
        self.assertEqual(
            "Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i",
            body_text)
Example #15
0
    def pdfreader_from_basefile(self, basefile):
        """Return a PDFReader for the downloaded file of ``basefile``.

        The intermediate XML is kept: bz2-compressed when
        ``self.config.compress`` equals ``"bz2"``, uncompressed
        otherwise. ``self.config.pdfimages`` is passed on as the
        reader's ``images`` argument.
        """
        source = self.store.downloaded_path(basefile)
        # Convoluted way of getting the directory of the intermediate
        # xml + png files that PDFReader will create
        workdir = os.path.dirname(self.store.intermediate_path(basefile))
        keep_xml = "bz2" if self.config.compress == "bz2" else True
        return PDFReader(filename=source,
                         workdir=workdir,
                         images=self.config.pdfimages,
                         keep_xml=keep_xml)
Example #16
0
 def test_autodetect_encoding(self):
     # DetectingDecoder should handle textboxes with different
     # encodings on the same page.
     from ferenda.sources.legal.se.decoders import DetectingDecoder
     self._copy_sample()
     reader = PDFReader(
         filename="test/files/pdfreader/multiple-encodings.pdf",
         workdir=self.datadir,
         textdecoder=DetectingDecoder())
     page = reader[0]
     expectations = (
         # unencoded (but marked as Custom encoding)
         "Detta är helt vanlig icke-kodad text på svenska.",
         # basic encoding (0x1d)
         "mellan Konungariket Sveriges regering och Konungariket Danmarks",
         # other encoding (0x20)
         "Skälen för regeringens bedömning och förslag",
     )
     for index, wanted in enumerate(expectations):
         self.assertEqual(wanted, str(page[index]))
Example #17
0
    def _parse_xml(self, xmlfrag, decoding_class=BaseTextDecoder):
        pdf = PDFReader(pages=True)
        pdf.fontspec = {}
        pdf._textdecoder = decoding_class()
        xml = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">
<pdf2xml producer="poppler" version="0.24.3">
<page number="1" position="absolute" top="0" left="0" height="750" width="500">
%s
</page>
</pdf2xml>""" % xmlfrag
        xmlfp = BytesIO(xml.encode("utf-8"))
        xmlfp.name = "dummy.xml"
        pdf._parse_xml(xmlfp)
        return pdf
Example #18
0
 def setUp(self):
     """Load the lipsum sample PDF and wrap it in a PDFAnalyzer."""
     self.maxDiff = None  # show full diffs on assertion failure
     lipsum = PDFReader(filename="test/files/pdfanalyze/lipsum.pdf",
                        workdir="test/files/pdfanalyze/")
     self.pdf = lipsum
     self.analyzer = PDFAnalyzer(lipsum)
Example #19
0
    def test_basic(self):
        # Read the sample PDF and check the files left in the workdir,
        # the page and textbox structure, and (if cropping is possible)
        # the content of the single box left after cropping.
        try:
            reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                               workdir=self.datadir)
        except errors.ExternalCommandError:
            # fall back to the canned sample and retry
            self._copy_sample()
            reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                               workdir=self.datadir)

        # a temporary copy of the pdf file should not be lying around
        # in workdir
        self.assertFalse(os.path.exists(self.datadir + os.sep + "sample.pdf"))
        # but the XML file should be stored for subsequent parses
        self.assertTrue(os.path.exists(self.datadir + os.sep + "sample.xml"))

        # The PDF contained actual textboxes
        self.assertFalse(reader.is_empty())

        self.assertEqual(len(reader), 1)
        # first page, first box
        title = str(reader[0][0])
        self.assertEqual("Document title ", title)

        self.assertEqual(570, reader.median_box_width())

        page = reader[0]
        self.assertEqual("Page 1 (892 x 1263): 'Document title  This is a simple documen...'", str(page))

        # an uncropped doc should have nine nonempty textboxes
        self.assertEqual(9, len(list(page.boundingbox())))

        # a smaller bounding box yields just one
        self.assertEqual(1,
                         len(list(page.boundingbox(190, 130, 230, 460))))

        # cropping it with the same dimensions
        # NOTE: This will fail if convert (from imagemagick) isn't installed)
        try:
            page.crop(190, 130, 230, 460)
        except errors.ExternalCommandError as e:
            # The rest of the assertions cannot succeed now. Report the
            # test as skipped rather than returning silently, so that
            # the run does not look as if everything was verified.
            # FIXME: We should try to find a way to run them anyway
            self.skipTest("page.crop failed, remaining checks not run: %s"
                          % e)

        # should also result in just one box -- the bottom one
        boxes = list(page.boundingbox())
        self.assertEqual(1, len(boxes))

        box = boxes[0]

        self.assertEqual("This is a simple document in PDF format. ", str(box))
        self.assertEqual('#000000', box.font.color)
        self.assertEqual(16, box.font.size)
        self.assertEqual('1', box.font.id)
        self.assertEqual('Cambria', box.font.family)

        # this box should have four text elements
        self.assertEqual(4, len(box))
        self.assertIsNone(box[0].tag)
        self.assertEqual("i", box[1].tag)
        self.assertEqual("ib", box[2].tag)
        self.assertIsNone(box[3].tag)
Example #20
0
    def test_basic(self):
        # Read the sample PDF and check the files left in the workdir,
        # the page and textbox structure, and (if cropping is possible)
        # the content of the single box left after cropping.
        try:
            reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                               workdir=self.datadir)
        except errors.ExternalCommandError:
            # fall back to the canned sample and retry
            self._copy_sample()
            reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                               workdir=self.datadir)

        # a temporary copy of the pdf file should not be lying around
        # in workdir
        self.assertFalse(os.path.exists(self.datadir + os.sep + "sample.pdf"))
        # but the XML file should be stored for subsequent parses
        self.assertTrue(os.path.exists(self.datadir + os.sep + "sample.xml"))

        # The PDF contained actual textboxes
        self.assertFalse(reader.is_empty())

        self.assertEqual(len(reader), 1)
        # first page, first box
        title = str(reader[0][0])
        self.assertEqual("Document title ", title)

        self.assertEqual(570, reader.median_box_width())

        page = reader[0]
        self.assertEqual(
            "Page 1 (892 x 1263): 'Document title  This is a simple documen...'",
            str(page))

        # an uncropped doc should have nine nonempty textboxes
        self.assertEqual(9, len(list(page.boundingbox())))

        # a smaller bounding box yields just one
        self.assertEqual(1, len(list(page.boundingbox(190, 130, 230, 460))))

        # cropping it with the same dimensions
        # NOTE: This will fail if convert (from imagemagick) isn't installed)
        try:
            page.crop(190, 130, 230, 460)
        except errors.ExternalCommandError as e:
            # The rest of the assertions cannot succeed now. Report the
            # test as skipped rather than returning silently, so that
            # the run does not look as if everything was verified.
            # FIXME: We should try to find a way to run them anyway
            self.skipTest("page.crop failed, remaining checks not run: %s"
                          % e)

        # should also result in just one box -- the bottom one
        boxes = list(page.boundingbox())
        self.assertEqual(1, len(boxes))

        box = boxes[0]

        self.assertEqual("This is a simple document in PDF format. ", str(box))
        self.assertEqual('#000000', box.font.color)
        self.assertEqual(16, box.font.size)
        self.assertEqual('1', box.font.id)
        self.assertEqual('Cambria', box.font.family)

        # this box should have four text elements
        self.assertEqual(4, len(box))
        self.assertIsNone(box[0].tag)
        self.assertEqual("i", box[1].tag)
        self.assertEqual("ib", box[2].tag)
        self.assertIsNone(box[3].tag)
Example #21
0
    def test_20_encoding(self):
        # Check decoded text, font family and element tags for a
        # selection of textboxes decoded with OffsetDecoder20.
        #
        # for this file, we don't even have a real PDF file, just some
        # copypasted excerpts from an intermediate XML file
        from ferenda.sources.legal.se.decoders import OffsetDecoder20
        self._copy_sample()
        decoder = OffsetDecoder20(kommittenamn="Datalagskommittén")
        reader = PDFReader(
            filename="test/files/pdfreader/prop_1997_98_44.pdf",
            workdir=self.datadir,
            textdecoder=decoder)
        page = reader[0]

        # unencoded
        self.assertEqual("Personuppgiftslag", str(page[0]))
        # basic encoding
        self.assertEqual("Laila Freivalds", str(page[1]))
        self.assertEqual("Pierre Schori", str(page[2]))

        # footnote glueing
        box = page[3]
        self.assertEqual("Härigenom föreskrivs1 följande.", str(box))
        self.assertEqual(241, box.width)
        self.assertEqual(326, box.right)
        self.assertEqual("Härigenom föreskrivs", box[0])
        self.assertEqual("1", box[1])
        self.assertEqual("sup", box[1].tag)
        self.assertEqual(" följande.", box[2])

        # basic encoding; the font should stay, and there is no tag
        # (the font family tells it's bold)
        box = page[4]
        self.assertEqual("Allmänna bestämmelser", str(box))
        self.assertEqual("Times.New.Roman.Fet0100", box.font.family)
        self.assertEqual(None, box[0].tag)

        # basic encoding; the font should be changed to default, and
        # since this element is <i>, the main font family should not
        # be an italic
        box = page[5]
        self.assertEqual("Syftet med lagen", str(box))
        self.assertEqual("Times-Roman", box.font.family)
        self.assertEqual("i", box[0].tag)

        # non-marked up bold-then-normal textelement
        box = page[6]
        self.assertEqual(
            "1 § Syftet med denna lag är att skydda människor mot att deras personli-",
            str(box))
        self.assertEqual("Times-Roman", box.font.family)
        self.assertEqual("1 §", box[0])
        self.assertEqual("b", box[0].tag)
        self.assertEqual(None, box[1].tag)

        # marked up italic/encoded textelement followed by
        # normal/nonencoded
        box = page[8]
        self.assertEqual(
            "Personuppgiftsansvarig Den som ensam eller tillsammans med andra",
            str(box))
        self.assertEqual("Personuppgiftsansvarig ", box[0])
        self.assertEqual("i", box[0].tag)
        self.assertEqual(None, box[1].tag)

        # non-marked up bold-then-normal textelement, fixed string
        box = page[14]
        self.assertEqual(
            "Regeringens bedömning: En lagstiftning som reglerar själva hante-",
            str(box))
        self.assertEqual("Times-Roman", box.font.family)
        self.assertEqual("Regeringens bedömning:", box[0])
        self.assertEqual("b", box[0].tag)
        self.assertEqual(None, box[1].tag)

        # non-marked up bold-then-normal textelement, fixed string
        box = page[16]
        self.assertEqual(
            "Datalagskommitténs bedömning överensstämmer med regeringens.",
            str(box))
        self.assertEqual("Times-Roman", box.font.family)
        self.assertEqual("Datalagskommitténs bedömning", box[0])
        self.assertEqual("b", box[0].tag)
        self.assertEqual(None, box[1].tag)

        # non-marked up bold-then-normal textelement, fixed string,
        # followed by encoded italics, forcing us to drop back to the
        # default decoding strategy in OffsetDecoder1d
        box = page[36]
        self.assertEqual(
            "Remissinstanserna: Kammarrätten i Göteborg anser att den registre-",
            str(box))
        self.assertEqual("Times-Roman", box.font.family)
        self.assertEqual("Remissinstanserna:", box[0])
        self.assertEqual("b", box[0].tag)
        self.assertEqual(None, box[1].tag)
        self.assertEqual("Kammarrätten i Göteborg ", box[2])
        self.assertEqual("i", box[2].tag)
        self.assertEqual(None, box[3].tag)

        # ")" is encoded as TAB
        box = page[39]
        self.assertEqual("Landsorganisationen i Sverige (LO)", box[0])
        self.assertEqual("i", box[0].tag)
Example #22
0
 def setUp(self):
     """Create a scratch directory and a fresh PDFReader for the test."""
     self.maxDiff = None  # show full diffs on assertion failure
     scratch = tempfile.mkdtemp()
     self.datadir = scratch
     self.reader = PDFReader()