def test_1d_encoding(self):
    # Since version 0.57.0, pdftohtml silently discards some invalid
    # characters from its output. Our sample uses a font with a custom
    # encoding in which 0x03 (ctrl-c) represents a space, and
    # pdftohtml 0.57+ drops that character before we get a chance to
    # read and decode it. Until something can be done about recent
    # pdftohtml versions, this test always uses pre-generated output
    # from an older version.
    from ferenda.sources.legal.se.decoders import OffsetDecoder1d
    # Disabled live-conversion path, kept for reference:
    #try:
    #    reader = PDFReader(filename="test/files/pdfreader/custom-encoding.pdf",
    #                       workdir=self.datadir,
    #                       textdecoder=OffsetDecoder1d())
    #except errors.ExternalCommandError as e:
    #    print("test_custom_encoding got ExternalCommandError %s, copying sample and retrying" % e)
    self._copy_sample()
    reader = PDFReader(
        filename="test/files/pdfreader/custom-encoding.pdf",
        workdir=self.datadir,
        textdecoder=OffsetDecoder1d())
    tbs = list(reader.textboxes())
    # Textboxes 5 and 6 use a font with a custom encoding; make sure
    # that it is properly decoded.
    expectations = (
        (5, "Göran Persson"),
        (6, "Bosse Ringholm"),
        (7, "(Finansdepartementet)"),
    )
    for index, wanted in expectations:
        self.assertEqual(wanted, str(tbs[index]))
def test_bz2(self):
    # Verify that keep_xml="bz2" stores the intermediate XML only in
    # bzip2-compressed form, and that a second parse can be made
    # against the same workdir.
    sample_pdf = "test/files/pdfreader/sample.pdf"
    xml_path = os.path.join(self.datadir, "sample.xml")
    bz2_path = xml_path + ".bz2"
    try:
        reader = PDFReader(filename=sample_pdf,
                           workdir=self.datadir,
                           keep_xml="bz2")
    except errors.ExternalCommandError:
        self._copy_sample()
        # bzip2 our canned sample.xml. Both handles are managed by
        # ``with`` so they are closed even if the read/write fails
        # (the compressed file used to be closed by hand only on
        # success).
        with open(xml_path, "rb") as rfp, BZ2File(bz2_path, "wb") as wfp:
            wfp.write(rfp.read())
        os.unlink(xml_path)
        reader = PDFReader(filename=sample_pdf,
                           workdir=self.datadir,
                           keep_xml="bz2")
    # a temporary copy of the pdf file should not be lying around in
    # workdir
    self.assertFalse(
        os.path.exists(os.path.join(self.datadir, "sample.pdf")))
    # but the XML file (only in bz2 format) should be stored
    self.assertTrue(os.path.exists(bz2_path))
    self.assertFalse(os.path.exists(xml_path))
    # first page, first box
    self.assertEqual("Document title ", str(reader[0][0]))
    # parsing again should reuse the existing sample.xml.bz2
    reader = PDFReader(filename=sample_pdf,
                       workdir=self.datadir,
                       keep_xml="bz2")
def parse(self, doc):
    """Populate ``doc`` from the best available downloaded file.

    Sets ``doc.uri`` and basic metadata, then picks a source in this
    order of preference:

    1. a downloaded ``.pdf`` file,
    2. a downloaded word processor file (``.doc``, ``.docx``,
       ``.wpd`` or ``.rtf``), converted to an intermediate PDF with
       the configured ``soffice`` command,
    3. the downloaded HTML file, whose ``<pre>`` section is extracted
       to an intermediate text file.

    PDF sources are handed to ``self.parse_from_pdfreader``, text
    sources to ``self.parse_from_textreader``.

    :param doc: the document object to fill in; ``doc.basefile``
                selects which downloaded files to use.
    """
    doc.uri = self.canonical_uri(doc.basefile)
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    self.infer_triples(d, doc.basefile)

    # prefer PDF or Word files over the plaintext-containing HTML files
    # FIXME: PDF or Word files are now stored as attachments
    pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')
    wordfiles = (self.generic_path(doc.basefile, 'downloaded', '.doc'),
                 self.generic_path(doc.basefile, 'downloaded', '.docx'),
                 self.generic_path(doc.basefile, 'downloaded', '.wpd'),
                 self.generic_path(doc.basefile, 'downloaded', '.rtf'))
    # If several word processor files exist, the last one in the
    # tuple above wins.
    wordfile = None
    for f in wordfiles:
        if os.path.exists(f):
            wordfile = f

    # if we lack a .pdf file, use Open/LibreOffice to convert any
    # .wpd or .doc file to .pdf first
    if wordfile and not os.path.exists(pdffile):
        intermediate_pdf = self.generic_path(
            doc.basefile, "intermediate", ".pdf")
        if not os.path.exists(intermediate_pdf):
            # Quote the input file the same way as the output
            # directory, so that paths containing spaces survive.
            cmdline = "%s --headless -convert-to pdf -outdir '%s' '%s'" % (
                self.config.get('soffice', 'soffice'),
                os.path.dirname(intermediate_pdf),
                wordfile)
            self.log.debug(
                "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
            util.runcmd(cmdline, require_success=True)
        pdffile = intermediate_pdf

    if os.path.exists(pdffile):
        self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
        # The '.foo' suffix is a dummy: only the directory part of
        # the generated path is used.
        intermediate_dir = os.path.dirname(
            self.generic_path(doc.basefile, 'intermediate', '.foo'))
        self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
        pdfreader = PDFReader()
        pdfreader.read(pdffile, intermediate_dir)
        self.parse_from_pdfreader(pdfreader, doc)
    else:
        downloaded_path = self.downloaded_path(doc.basefile)
        intermediate_path = self.generic_path(
            doc.basefile, 'intermediate', '.txt')
        self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                                              downloaded_path,
                                              intermediate_path))
        if not os.path.exists(intermediate_path):
            # Close the downloaded file deterministically instead of
            # leaving the handle to the garbage collector.
            with codecs.open(downloaded_path,
                             encoding="iso-8859-1") as fp:
                html = fp.read()
            util.writefile(intermediate_path,
                           util.extract_text(html, '<pre>', '</pre>'),
                           encoding="utf-8")
        textreader = TextReader(intermediate_path, encoding="utf-8")
        self.parse_from_textreader(textreader, doc)
class Read(unittest.TestCase):
    """Exercise PDFReader.read() against the bundled sample PDF."""

    def setUp(self):
        self.maxDiff = None
        self.datadir = tempfile.mkdtemp()
        self.reader = PDFReader()

    def tearDown(self):
        shutil.rmtree(self.datadir)

    def test_basic(self):
        sample = "test/files/pdfreader/sample.pdf"
        canned_dir = "test/files/pdfreader/intermediate"
        try:
            self.reader.read(sample, self.datadir)
        except errors.ExternalCommandError:
            # Fall back to the canned intermediate files, renamed
            # from "index*" to "sample*", and read again.
            for fname in os.listdir(canned_dir):
                target = os.sep.join(
                    (self.datadir, fname.replace("index", "sample")))
                shutil.copy(
                    "test/files/pdfreader/intermediate/%s" % fname,
                    target)
            self.reader.read(sample, self.datadir)

        self.assertEqual(len(self.reader), 1)
        # first page, first box
        title = str(self.reader[0][0])
        self.assertEqual("Document title ", title)
        self.assertEqual(318, self.reader.median_box_width())

        page = self.reader[0]
        self.assertEqual(
            "Page 1 (892 x 1263): 'Document title This is a simple documen...'",
            str(page))

        # an uncropped doc should have two textboxes
        self.assertEqual(2, len(list(page.boundingbox())))
        # a smaller bounding box yields just one
        region = (190, 130, 230, 460)
        self.assertEqual(1, len(list(page.boundingbox(*region))))
        # cropping it with the same dimensions
        page.crop(*region)
        # should also result in just one box -- the bottom one
        boxes = list(page.boundingbox())
        self.assertEqual(1, len(boxes))
        box = boxes[0]
        self.assertEqual("This is a simple document in PDF format. ",
                         str(box))
        self.assertEqual({'color': '#000000',
                          'size': '16',
                          'id': '1',
                          'family': 'Times'},
                         box.getfont())

        # this box should have four text elements
        self.assertEqual(4, len(box))
        for position, tag in enumerate((None, "i", "ib", None)):
            self.assertEqual(tag, box[position].tag)
def pdfreader_from_basefile(self, basefile):
    """Return a PDFReader that has read the downloaded file for *basefile*.

    :param basefile: identifier used to look up the downloaded and
                     intermediate paths in ``self.store``.
    :returns: the PDFReader object after ``read()`` has been called.
    """
    source = self.store.downloaded_path(basefile)
    # The intermediate xml + png files that PDFReader will create go
    # in the directory holding the intermediate path for this
    # basefile; only the directory part of that path is used.
    workdir = os.path.dirname(self.store.intermediate_path(basefile))
    reader = PDFReader()
    reader.read(source, workdir)
    return reader
def test_fallback_ocr(self):
    scanned = "test/files/pdfreader/scanned-ecma-99.pdf"
    try:
        # Actually running tesseract takes ages -- for day-to-day
        # testing we can just as well use the canned hocr.html
        # files that _copy_sample fixes for us.
        if not os.environ.get("FERENDA_TEST_TESSERACT"):
            raise errors.ExternalCommandError
        reader = PDFReader(filename=scanned,
                           workdir=self.datadir,
                           images=False)
    except errors.ExternalCommandError:
        self._copy_sample()
        reader = PDFReader(filename=scanned,
                           workdir=self.datadir,
                           images=False)
    # Without OCR, no text is found in the scanned document ...
    self.assertTrue(reader.is_empty())
    # ... but reading it again with an OCR language yields text.
    reader = PDFReader(filename=scanned,
                       workdir=self.datadir,
                       ocr_lang="eng")
    self.assertFalse(reader.is_empty())
    self.assertEqual(2, len(reader))
    second_box = util.normalize_space(str(reader[0][1]))
    self.assertEqual("EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
                     second_box)
def _parse_xml(self, xmlfrag, decoding_class=BaseTextDecoder):
    """Wrap *xmlfrag* in a one-page pdf2xml document and parse it.

    :param xmlfrag: markup to place inside the single ``<page>``
                    element.
    :param decoding_class: class that is instantiated and installed
                           as the reader's text decoder.
    :returns: the PDFReader object after parsing the document.
    """
    pdf = PDFReader(pages=True)
    pdf.fontspec = {}
    pdf._textdecoder = decoding_class()
    template = " ".join((
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">',
        '<pdf2xml producer="poppler" version="0.24.3">',
        '<page number="1" position="absolute" top="0" left="0" height="750" width="500">',
        '%s',
        '</page>',
        '</pdf2xml>',
    ))
    document = template % xmlfrag
    stream = BytesIO(document.encode("utf-8"))
    # Give the in-memory stream a file-like name before parsing.
    stream.name = "dummy.xml"
    pdf._parse_xml(stream)
    return pdf
def test_dontkeep(self):
    # With keep_xml=False, neither the plain nor the compressed
    # intermediate XML file may remain in the work directory.
    xml_path = os.sep.join((self.datadir, "sample.xml"))
    bz2_path = os.sep.join((self.datadir, "sample.xml.bz2"))
    sample_pdf = "test/files/pdfreader/sample.pdf"

    self.assertFalse(os.path.exists(bz2_path))
    try:
        reader = PDFReader(filename=sample_pdf,
                           workdir=self.datadir,
                           keep_xml=False)
    except errors.ExternalCommandError:
        self._copy_sample()
        reader = PDFReader(filename=sample_pdf,
                           workdir=self.datadir,
                           keep_xml=False)

    # No XML file should exist
    self.assertFalse(os.path.exists(xml_path))
    self.assertFalse(os.path.exists(bz2_path))
def setUp(self):
    """Build the glue function and a PDFReader with a pass-through decoder."""
    # create a mock analyzer
    mock_analyzer = PDFAnalyzer(None)
    mock_analyzer.scanned_source = self.scanned_source
    self.gluefunc = Offtryck().get_gluefunc('basefile', mock_analyzer)

    reader = PDFReader()
    reader.fontspec = {}
    # Stand-in text decoder: returns its first argument unchanged,
    # and its ``fontspec`` attribute is likewise an identity function.
    reader._textdecoder = lambda x, y: x
    reader._textdecoder.fontspec = lambda x: x
    self.pdfreader = reader
def setUp(self):
    """Build the glue function and a PDFReader using OffsetDecoder20."""
    # create a mock analyzer
    mock_analyzer = PDFAnalyzer(None)
    mock_analyzer.scanned_source = False
    self.gluefunc = Offtryck().get_gluefunc('basefile', mock_analyzer)

    reader = PDFReader()
    reader.fontspec = {}
    from ferenda.sources.legal.se.decoders import OffsetDecoder20
    reader._textdecoder = OffsetDecoder20()
    self.pdfreader = reader
def test_json_roundtrip(self):
    # a more realistic roundtrip example with some hairy parts
    from ferenda import PDFDocumentRepository, PDFReader
    repo = PDFDocumentRepository()
    doc = repo.make_document("sample")
    intermediate_dir = "test/files/pdfreader/intermediate"
    # Touch the canned XML to make SURE that the intermediate files
    # are newer than the pdf.
    os.utime("test/files/pdfreader/intermediate/sample.xml", None)
    reader = PDFReader(filename="test/files/pdfreader/sample.pdf",
                       workdir=intermediate_dir)
    repo.parse_from_pdfreader(reader, doc)
    # Serialize to JSON and back; the result must equal the original.
    as_json = serialize(doc, format="json")
    restored = deserialize(as_json, format="json")
    self.assertEqual(doc, restored)
def test_fallback_ocr(self):
    pdf_path = "test/files/pdfreader/scanned-ecma-99.pdf"
    no_images = dict(filename=pdf_path,
                     workdir=self.datadir,
                     images=False)
    try:
        # Actually running tesseract takes ages -- for day-to-day
        # testing we can just as well use the canned hocr.html
        # files that _copy_sample fixes for us.
        if not os.environ.get("FERENDA_TEST_TESSERACT"):
            raise errors.ExternalCommandError
        reader = PDFReader(**no_images)
    except errors.ExternalCommandError:
        self._copy_sample()
        reader = PDFReader(**no_images)
    # Read without OCR, the scanned file has no text ...
    self.assertTrue(reader.is_empty())

    # ... while a read with an OCR language produces two pages of it.
    reader = PDFReader(filename=pdf_path,
                       workdir=self.datadir,
                       ocr_lang="eng")
    self.assertFalse(reader.is_empty())
    self.assertEqual(2, len(reader))
    self.assertEqual(
        "EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
        util.normalize_space(str(reader[0][1])))
def test_ocr(self):
    scanned = "test/files/pdfreader/scanned.pdf"
    try:
        # Only run tesseract for real when explicitly asked to;
        # otherwise take the canned-sample path below.
        if not os.environ.get("FERENDA_TEST_TESSERACT"):
            raise errors.ExternalCommandError
        reader = PDFReader(filename=scanned,
                           workdir=self.datadir,
                           ocr_lang="swe")
    except errors.ExternalCommandError:
        self._copy_sample()
        reader = PDFReader(filename=scanned,
                           workdir=self.datadir,
                           ocr_lang="swe")

    # assert that a hOCR file has been created
    hocr_path = os.sep.join((self.datadir, "scanned.hocr.html"))
    self.assertTrue(os.path.exists(hocr_path))

    # assert that we have two pages
    self.assertEqual(2, len(reader))

    # assert that first element in the first textbox in the first
    # page corresponds to the first bbox, scaled by the
    # pixel/point scaling factor.
    self.assertEqual("Regeringens ", str(reader[0][0][0]))
    for attribute, wanted in (("top", 47),
                              ("left", 38),
                              ("height", 21),
                              ("width", 118)):
        self.assertEqual(wanted, getattr(reader[0][0][0], attribute))

    # assert that the fifth (formerly third) textbox, which has
    # mostly normal text, is rendered correctly (note that we have a
    # couple of OCR errors).
    # self.assertEqual("Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i bifogade utdrag ur regeringsprotokollet den 31 oktober l99l.", util.normalize_space(str(reader[0][3])))
    self.assertEqual(
        "Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i",
        util.normalize_space(str(reader[0][5])))
def pdfreader_from_basefile(self, basefile):
    """Return a PDFReader for the downloaded file of *basefile*.

    The reader keeps its intermediate XML bzip2-compressed when
    ``self.config.compress`` is ``"bz2"`` and uncompressed otherwise;
    image handling follows ``self.config.pdfimages``.

    :param basefile: identifier used to look up the downloaded and
                     intermediate paths in ``self.store``.
    :returns: the constructed PDFReader object.
    """
    pdffile = self.store.downloaded_path(basefile)
    # The intermediate xml + png files that PDFReader will create go
    # in the directory holding the intermediate path for this
    # basefile; only the directory part of that path is used.
    workdir = os.path.dirname(self.store.intermediate_path(basefile))
    keep_xml = "bz2" if self.config.compress == "bz2" else True
    return PDFReader(filename=pdffile,
                     workdir=workdir,
                     images=self.config.pdfimages,
                     keep_xml=keep_xml)
def test_autodetect_encoding(self):
    from ferenda.sources.legal.se.decoders import DetectingDecoder
    self._copy_sample()
    reader = PDFReader(
        filename="test/files/pdfreader/multiple-encodings.pdf",
        workdir=self.datadir,
        textdecoder=DetectingDecoder())
    first_page = reader[0]
    # unencoded (but marked as Custom encoding)
    self.assertEqual(
        "Detta är helt vanlig icke-kodad text på svenska.",
        str(first_page[0]))
    # basic encoding (0x1d)
    self.assertEqual(
        "mellan Konungariket Sveriges regering och Konungariket Danmarks",
        str(first_page[1]))
    self.assertEqual(
        "Skälen för regeringens bedömning och förslag",
        str(first_page[2]))  # other encoding (0x20
def setUp(self):
    """Create a PDFAnalyzer on top of the lipsum sample PDF."""
    self.maxDiff = None
    lipsum = PDFReader(filename="test/files/pdfanalyze/lipsum.pdf",
                       workdir="test/files/pdfanalyze/")
    self.pdf = lipsum
    self.analyzer = PDFAnalyzer(self.pdf)
def test_basic(self):
    sample_pdf = "test/files/pdfreader/sample.pdf"
    try:
        reader = PDFReader(filename=sample_pdf, workdir=self.datadir)
    except errors.ExternalCommandError:
        self._copy_sample()
        reader = PDFReader(filename=sample_pdf, workdir=self.datadir)

    # a temporary copy of the pdf file should not be lying around
    # in workdir
    # print("Checking if %s has been unlinked" % (self.datadir +
    #                                              os.sep + "sample.pdf"))
    leftover_pdf = os.sep.join((self.datadir, "sample.pdf"))
    self.assertFalse(os.path.exists(leftover_pdf))
    # but the XML file should be stored for subsequent parses
    stored_xml = os.sep.join((self.datadir, "sample.xml"))
    self.assertTrue(os.path.exists(stored_xml))

    # The PDF contained actual textboxes
    self.assertFalse(reader.is_empty())
    self.assertEqual(len(reader), 1)

    # first page, first box
    title = str(reader[0][0])
    self.assertEqual("Document title ", title)
    self.assertEqual(570, reader.median_box_width())

    page = reader[0]
    self.assertEqual(
        "Page 1 (892 x 1263): 'Document title This is a simple documen...'",
        str(page))

    # an uncropped doc should have nine nonempty textboxes
    self.assertEqual(9, len(list(page.boundingbox())))
    # a smaller bounding box yields just one
    region = (190, 130, 230, 460)
    self.assertEqual(1, len(list(page.boundingbox(*region))))

    # cropping it with the same dimensions
    # NOTE: This will fail if convert (from imagemagick) isn't
    # installed
    try:
        page.crop(*region)
    except errors.ExternalCommandError:
        # the rest of the tests cannot succeed now. FIXME: We
        # should try to find a way to run them anyway
        return

    # should also result in just one box -- the bottom one
    boxes = list(page.boundingbox())
    self.assertEqual(1, len(boxes))
    box = boxes[0]
    self.assertEqual("This is a simple document in PDF format. ",
                     str(box))
    font = box.font
    self.assertEqual('#000000', font.color)
    self.assertEqual(16, font.size)
    self.assertEqual('1', font.id)
    self.assertEqual('Cambria', font.family)

    # this box should have four text elements
    self.assertEqual(4, len(box))
    for position, tag in enumerate((None, "i", "ib", None)):
        self.assertEqual(tag, box[position].tag)
def test_basic(self):
    reader_args = dict(filename="test/files/pdfreader/sample.pdf",
                       workdir=self.datadir)
    try:
        reader = PDFReader(**reader_args)
    except errors.ExternalCommandError:
        self._copy_sample()
        reader = PDFReader(**reader_args)

    def in_datadir(name):
        # Path of *name* inside the test's work directory.
        return self.datadir + os.sep + name

    # a temporary copy of the pdf file should not be lying around
    # in workdir
    # print("Checking if %s has been unlinked" % (self.datadir +
    #                                              os.sep + "sample.pdf"))
    self.assertFalse(os.path.exists(in_datadir("sample.pdf")))
    # but the XML file should be stored for subsequent parses
    self.assertTrue(os.path.exists(in_datadir("sample.xml")))

    # The PDF contained actual textboxes
    self.assertFalse(reader.is_empty())
    self.assertEqual(len(reader), 1)

    # first page, first box
    title = str(reader[0][0])
    self.assertEqual("Document title ", title)
    self.assertEqual(570, reader.median_box_width())

    page = reader[0]
    expected_summary = (
        "Page 1 (892 x 1263): 'Document title This is a simple documen...'")
    self.assertEqual(expected_summary, str(page))

    # an uncropped doc should have nine nonempty textboxes
    self.assertEqual(9, len(list(page.boundingbox())))
    # a smaller bounding box yields just one
    self.assertEqual(1, len(list(page.boundingbox(190, 130, 230, 460))))

    # cropping it with the same dimensions
    # NOTE: This will fail if convert (from imagemagick) isn't
    # installed
    try:
        page.crop(190, 130, 230, 460)
    except errors.ExternalCommandError:
        # the rest of the tests cannot succeed now. FIXME: We
        # should try to find a way to run them anyway
        return

    # should also result in just one box -- the bottom one
    remaining = list(page.boundingbox())
    self.assertEqual(1, len(remaining))
    box = remaining[0]
    self.assertEqual("This is a simple document in PDF format. ",
                     str(box))
    self.assertEqual('#000000', box.font.color)
    self.assertEqual(16, box.font.size)
    self.assertEqual('1', box.font.id)
    self.assertEqual('Cambria', box.font.family)

    # this box should have four text elements
    self.assertEqual(4, len(box))
    expected_tags = [None, "i", "ib", None]
    for idx in range(4):
        self.assertEqual(expected_tags[idx], box[idx].tag)
def test_20_encoding(self):
    # for this file, we don't even have a real PDF file, just some
    # copypasted excerpts from an intermediate XML file
    from ferenda.sources.legal.se.decoders import OffsetDecoder20
    self._copy_sample()
    decoder = OffsetDecoder20(kommittenamn="Datalagskommittén")
    reader = PDFReader(
        filename="test/files/pdfreader/prop_1997_98_44.pdf",
        workdir=self.datadir,
        textdecoder=decoder)
    page = reader[0]

    # unencoded
    self.assertEqual("Personuppgiftslag", str(page[0]))
    # basic encoding
    self.assertEqual("Laila Freivalds", str(page[1]))
    self.assertEqual("Pierre Schori", str(page[2]))

    # footnote glueing
    box = page[3]
    self.assertEqual("Härigenom föreskrivs1 följande.", str(box))
    self.assertEqual(241, box.width)
    self.assertEqual(326, box.right)
    self.assertEqual("Härigenom föreskrivs", box[0])
    self.assertEqual("1", box[1])
    self.assertEqual("sup", box[1].tag)
    self.assertEqual(" följande.", box[2])

    # basic encoding; the font should stay, and there is no tag
    # (the font family tells it's bold)
    box = page[4]
    self.assertEqual("Allmänna bestämmelser", str(box))
    self.assertEqual("Times.New.Roman.Fet0100", box.font.family)
    self.assertEqual(None, box[0].tag)

    # basic encoding; the font should be changed to default, and
    # since this element is <i>, the main font family should not be
    # an italic
    box = page[5]
    self.assertEqual("Syftet med lagen", str(box))
    self.assertEqual("Times-Roman", box.font.family)
    self.assertEqual("i", box[0].tag)

    # non-marked up bold-then-normal textelement
    box = page[6]
    self.assertEqual(
        "1 § Syftet med denna lag är att skydda människor mot att deras personli-",
        str(box))
    self.assertEqual("Times-Roman", box.font.family)
    self.assertEqual("1 §", box[0])
    self.assertEqual("b", box[0].tag)
    self.assertEqual(None, box[1].tag)

    # marked up italic/encoded textelement followed by
    # normal/nonencoded
    box = page[8]
    self.assertEqual(
        "Personuppgiftsansvarig Den som ensam eller tillsammans med andra",
        str(box))
    self.assertEqual("Personuppgiftsansvarig ", box[0])
    self.assertEqual("i", box[0].tag)
    self.assertEqual(None, box[1].tag)

    # non-marked up bold-then-normal textelement, fixed string
    box = page[14]
    self.assertEqual(
        "Regeringens bedömning: En lagstiftning som reglerar själva hante-",
        str(box))
    self.assertEqual("Times-Roman", box.font.family)
    self.assertEqual("Regeringens bedömning:", box[0])
    self.assertEqual("b", box[0].tag)
    self.assertEqual(None, box[1].tag)

    # non-marked up bold-then-normal textelement, fixed string
    box = page[16]
    self.assertEqual(
        "Datalagskommitténs bedömning överensstämmer med regeringens.",
        str(box))
    self.assertEqual("Times-Roman", box.font.family)
    self.assertEqual("Datalagskommitténs bedömning", box[0])
    self.assertEqual("b", box[0].tag)
    self.assertEqual(None, box[1].tag)

    # non-marked up bold-then-normal textelement, fixed string,
    # followed by encoded italics, forcing us to drop back to the
    # default decoding strategy in OffsetDecoder1d
    box = page[36]
    self.assertEqual(
        "Remissinstanserna: Kammarrätten i Göteborg anser att den registre-",
        str(box))
    self.assertEqual("Times-Roman", box.font.family)
    self.assertEqual("Remissinstanserna:", box[0])
    self.assertEqual("b", box[0].tag)
    self.assertEqual(None, box[1].tag)
    self.assertEqual("Kammarrätten i Göteborg ", box[2])
    self.assertEqual("i", box[2].tag)
    self.assertEqual(None, box[3].tag)

    # ")" is encoded as TAB
    element = page[39][0]
    self.assertEqual("Landsorganisationen i Sverige (LO)", element)
    self.assertEqual("i", element.tag)
def setUp(self):
    """Create a scratch directory and a fresh PDFReader for each test."""
    self.maxDiff = None
    scratch = tempfile.mkdtemp()
    self.datadir = scratch
    self.reader = PDFReader()