class Analyze(unittest.TestCase):
    # Smoke tests: each test only verifies that the analyzer writes the
    # expected artifact next to the lipsum.pdf fixture.

    def setUp(self):
        self.maxDiff = None
        reader = PDFReader(filename="test/files/pdfanalyze/lipsum.pdf",
                           workdir="test/files/pdfanalyze/")
        self.pdf = reader
        self.analyzer = PDFAnalyzer(reader)

    def tearDown(self):
        # Remove every artifact a test may have produced, in a fixed order.
        for leftover in ("test/files/pdfanalyze/lipsum.metrics.json",
                         "test/files/pdfanalyze/lipsum.plot.png",
                         "test/files/pdfanalyze/lipsum.debug.pdf"):
            util.robust_remove(leftover)

    def test_plot(self):
        # Only checks that a plot file shows up; its content is not inspected.
        target = "test/files/pdfanalyze/lipsum.plot.png"
        existed_before = os.path.exists(target)
        self.assertFalse(existed_before)
        self.analyzer.metrics(plotpath=target)
        exists_after = os.path.exists(target)
        self.assertTrue(exists_after)

    # reportlab doesn't work with py3.2, and the current release of pyPDF2
    # (1.24) has a py3 bug that crashes page merging (a patch exists at
    # https://github.com/mstamy2/PyPDF2/pull/172) -- but let's skip 3.2
    # and try the new pypdf2 1.25, hence the disabled decorator below.
    # @unittest.skipIf(sys.version_info > (3, 0, 0), "pyPDF2 not working on py3")
    def test_drawboxes(self):
        # Only checks that a debug PDF shows up; its content is not inspected.
        target = "test/files/pdfanalyze/lipsum.debug.pdf"
        existed_before = os.path.exists(target)
        self.assertFalse(existed_before)
        computed = self.analyzer.metrics()
        self.analyzer.drawboxes(target, metrics=computed)
        exists_after = os.path.exists(target)
        self.assertTrue(exists_after)
class Analyze(unittest.TestCase):
    # Tests for PDFAnalyzer run against the lipsum.pdf fixture: raw margin
    # and style counters, the metrics derived from them, and the optional
    # plot / debug-PDF outputs (exercised against mocked third-party modules).

    def setUp(self):
        self.maxDiff = None
        self.pdf = PDFReader(filename="test/files/pdfanalyze/lipsum.pdf",
                             workdir="test/files/pdfanalyze/")
        self.analyzer = PDFAnalyzer(self.pdf)

    def test_documents(self):
        self.assertEqual([(0, 3, 'main')], self.analyzer.documents)

    def test_vcounters(self):
        vcounters = self.analyzer.count_vertical_margins(0, 3)
        self.assertEqual(set(vcounters.keys()),
                         set(('bottommargin', 'topmargin', 'pageheight')))
        self.assertEqual(max(vcounters['pageheight']), 1262)
        # charcount of topmargins from 2 pages
        self.assertEqual(vcounters['bottommargin'][76], 22)
        # pagenumbers on 3 pages
        self.assertEqual(vcounters['topmargin'][1167], 3)

    def test_hcounters(self):
        hcounters = self.analyzer.count_horizontal_margins(0, 3)
        self.assertEqual(set(hcounters.keys()),
                         set(('leftmargin', 'rightmargin',
                              'leftmargin_even', 'rightmargin_even',
                              'pagewidth')))
        # 775, 778 are pagenumbers on pg 1 + 3
        self.assertEqual(set(hcounters['leftmargin'].keys()),
                         set((135, 775, 778)))
        self.assertEqual(list(hcounters['leftmargin_even'].keys()), [108])
        self.assertEqual(hcounters['rightmargin'].most_common(1)[0][0], 784)

    def test_stylecounters(self):
        stylecounter = self.analyzer.count_styles(1, 2)
        self.assertEqual(dict(stylecounter),
                         {('Comic Sans MS', 14): 5922,
                          ('Cambria,Bold', 14): 133,
                          ('Cambria,Bold', 17): 128,
                          ('Cambria,Bold', 19): 61})

    def test_analyze_hmargins(self):
        hcounters = self.analyzer.count_horizontal_margins(0, 3)
        hmetrics = self.analyzer.analyze_horizontal_margins(hcounters)
        self.assertEqual({'leftmargin': 135,
                          'leftmargin_even': 108,
                          'pagewidth': 892,
                          'rightmargin': 780,
                          'rightmargin_even': 760},
                         hmetrics)

    def test_analyze_vmargins(self):
        vcounters = self.analyzer.count_vertical_margins(0, 3)
        vmetrics = self.analyzer.analyze_vertical_margins(vcounters)
        # this will miscalculate the header zone because the header is
        # so wordy it's considered part of the main document text
        self.assertEqual(vmetrics, {'bottommargin': 1149,
                                    'pageheight': 1262,
                                    'topmargin': 53})

        # try again with double the thresholds
        self.analyzer.header_significance_threshold = 0.004
        vmetrics = self.analyzer.analyze_vertical_margins(vcounters)
        self.assertEqual(vmetrics, {'bottommargin': 1149,
                                    'pageheight': 1262,
                                    'topmargin': 107})

    def test_analyze_styles(self):
        stylecounter = self.analyzer.count_styles(1, 3)
        stylemetrics = self.analyzer.analyze_styles(stylecounter)
        self.assertEqual({'default': {'family': 'Comic Sans MS', 'size': 14},
                          'h1': {'family': 'Cambria,Bold', 'size': 19},
                          'h2': {'family': 'Cambria,Bold', 'size': 17},
                          'h3': {'family': 'Cambria,Bold', 'size': 14}},
                         stylemetrics)

    # this is more of a functional test
    def test_margins(self):
        jsonpath = "test/files/pdfanalyze/lipsum.metrics.json"
        try:
            self.assertFalse(os.path.exists(jsonpath))
            metrics = self.analyzer.metrics(jsonpath, startpage=1)
            self.assertEqual({'default': {'family': 'Comic Sans MS', 'size': 14},
                              'bottommargin': 1149,
                              'h1': {'family': 'Cambria,Bold', 'size': 19},
                              'h2': {'family': 'Cambria,Bold', 'size': 17},
                              'h3': {'family': 'Cambria,Bold', 'size': 14},
                              'topmargin': 53,
                              'leftmargin': 135,
                              'leftmargin_even': 108,
                              'pageheight': 1262,
                              'pagewidth': 892,
                              'rightmargin': 780,
                              'rightmargin_even': 760,
                              'scanned_source': False},
                             metrics)
            self.assertTrue(os.path.exists(jsonpath))
        finally:
            util.robust_remove(jsonpath)

    def test_margins_subdocument(self):
        self.analyzer.frontmatter = 0
        # note that this will only analyze a single even page
        metrics = self.analyzer.metrics(startpage=1, pagecount=1)
        self.assertEqual({'default': {'family': 'Comic Sans MS', 'size': 14},
                          'bottommargin': 1149,
                          'h1': {'family': 'Cambria,Bold', 'size': 19},
                          'h2': {'family': 'Cambria,Bold', 'size': 17},
                          'h3': {'family': 'Cambria,Bold', 'size': 14},
                          'topmargin': 53,
                          'leftmargin_even': 108,
                          'pageheight': 1262,
                          'pagewidth': 892,
                          'rightmargin_even': 760,
                          'scanned_source': False},
                         metrics)

    def test_plot(self):
        # matplotlib is replaced in sys.modules by mocks, so this only
        # checks that metrics() reaches pyplot.savefig.
        matplotmock = MagicMock()
        mocks = {'matplotlib': matplotmock,
                 'matplotlib.pyplot': MagicMock()}
        with patch.dict('sys.modules', mocks):
            self.analyzer.metrics(plotpath="foo/bar/baz")
            self.assertTrue(matplotmock.pyplot.savefig.called)

    def test_drawboxes(self):
        # PyPDF2 and reportlab are replaced in sys.modules by mocks, so this
        # only checks that drawboxes() calls into both libraries.
        pypdfmock = MagicMock()
        canvasmock = MagicMock()
        mocks = {'PyPDF2': pypdfmock,
                 'reportlab': MagicMock(),
                 'reportlab.pdfgen': MagicMock(),
                 'reportlab.pdfgen.canvas': canvasmock}
        pdfpath = "test/files/pdfanalyze/lipsum.debug.pdf"
        with patch.dict('sys.modules', mocks):
            try:
                metrics = self.analyzer.metrics()
                self.analyzer.drawboxes(pdfpath, metrics=metrics)
                self.assertTrue(canvasmock.Canvas.called)
                self.assertTrue(pypdfmock.PdfFileReader.called)
                self.assertTrue(pypdfmock.PdfFileWriter.called)
            finally:
                # Clean up even when a call or an assertion above fails, so
                # a failed run cannot leave the debug PDF behind (same
                # pattern as test_margins).
                util.robust_remove(pdfpath)
class Analyze(unittest.TestCase):
    # Tests for PDFAnalyzer run against the lipsum.pdf fixture: raw margin
    # and style counters, the metrics derived from them, and the optional
    # plot / debug-PDF outputs (exercised against mocked third-party modules).

    def setUp(self):
        self.maxDiff = None
        self.pdf = PDFReader(filename="test/files/pdfanalyze/lipsum.pdf",
                             workdir="test/files/pdfanalyze/")
        self.analyzer = PDFAnalyzer(self.pdf)

    def test_documents(self):
        self.assertEqual([(0, 3, 'main')], self.analyzer.documents)

    def test_vcounters(self):
        vcounters = self.analyzer.count_vertical_margins(0, 3)
        self.assertEqual(
            set(vcounters.keys()),
            set(('bottommargin', 'topmargin', 'pageheight')))
        self.assertEqual(max(vcounters['pageheight']), 1262)
        # charcount of topmargins from 2 pages
        self.assertEqual(vcounters['bottommargin'][76], 22)
        # pagenumbers on 3 pages
        self.assertEqual(vcounters['topmargin'][1167], 3)

    def test_hcounters(self):
        hcounters = self.analyzer.count_horizontal_margins(0, 3)
        self.assertEqual(
            set(hcounters.keys()),
            set(('leftmargin', 'rightmargin', 'leftmargin_even',
                 'rightmargin_even', 'pagewidth')))
        # 775, 778 are pagenumbers on pg 1 + 3
        self.assertEqual(
            set(hcounters['leftmargin'].keys()),
            set((135, 775, 778)))
        self.assertEqual(list(hcounters['leftmargin_even'].keys()), [108])
        self.assertEqual(hcounters['rightmargin'].most_common(1)[0][0], 784)

    def test_stylecounters(self):
        stylecounter = self.analyzer.count_styles(1, 2)
        self.assertEqual(
            dict(stylecounter),
            {
                ('Comic Sans MS', 14): 5922,
                ('Cambria,Bold', 14): 133,
                ('Cambria,Bold', 17): 128,
                ('Cambria,Bold', 19): 61
            })

    def test_analyze_hmargins(self):
        hcounters = self.analyzer.count_horizontal_margins(0, 3)
        hmetrics = self.analyzer.analyze_horizontal_margins(hcounters)
        self.assertEqual(
            {
                'leftmargin': 135,
                'leftmargin_even': 108,
                'pagewidth': 892,
                'rightmargin': 780,
                'rightmargin_even': 760
            },
            hmetrics)

    def test_analyze_vmargins(self):
        vcounters = self.analyzer.count_vertical_margins(0, 3)
        vmetrics = self.analyzer.analyze_vertical_margins(vcounters)
        # this will miscalculate the header zone because the header is
        # so wordy it's considered part of the main document text
        self.assertEqual(vmetrics, {
            'bottommargin': 1149,
            'pageheight': 1262,
            'topmargin': 53
        })

        # try again with double the thresholds
        self.analyzer.header_significance_threshold = 0.004
        vmetrics = self.analyzer.analyze_vertical_margins(vcounters)
        self.assertEqual(vmetrics, {
            'bottommargin': 1149,
            'pageheight': 1262,
            'topmargin': 107
        })

    def test_analyze_styles(self):
        stylecounter = self.analyzer.count_styles(1, 3)
        stylemetrics = self.analyzer.analyze_styles(stylecounter)
        self.assertEqual(
            {
                'default': {'family': 'Comic Sans MS', 'size': 14},
                'h1': {'family': 'Cambria,Bold', 'size': 19},
                'h2': {'family': 'Cambria,Bold', 'size': 17},
                'h3': {'family': 'Cambria,Bold', 'size': 14}
            },
            stylemetrics)

    # this is more of a functional test
    def test_margins(self):
        jsonpath = "test/files/pdfanalyze/lipsum.metrics.json"
        try:
            self.assertFalse(os.path.exists(jsonpath))
            metrics = self.analyzer.metrics(jsonpath, startpage=1)
            self.assertEqual(
                {
                    'default': {'family': 'Comic Sans MS', 'size': 14},
                    'bottommargin': 1149,
                    'h1': {'family': 'Cambria,Bold', 'size': 19},
                    'h2': {'family': 'Cambria,Bold', 'size': 17},
                    'h3': {'family': 'Cambria,Bold', 'size': 14},
                    'topmargin': 53,
                    'leftmargin': 135,
                    'leftmargin_even': 108,
                    'pageheight': 1262,
                    'pagewidth': 892,
                    'rightmargin': 780,
                    'rightmargin_even': 760,
                    'scanned_source': False
                },
                metrics)
            self.assertTrue(os.path.exists(jsonpath))
        finally:
            util.robust_remove(jsonpath)

    def test_margins_subdocument(self):
        self.analyzer.frontmatter = 0
        # note that this will only analyze a single even page
        metrics = self.analyzer.metrics(startpage=1, pagecount=1)
        self.assertEqual(
            {
                'default': {'family': 'Comic Sans MS', 'size': 14},
                'bottommargin': 1149,
                'h1': {'family': 'Cambria,Bold', 'size': 19},
                'h2': {'family': 'Cambria,Bold', 'size': 17},
                'h3': {'family': 'Cambria,Bold', 'size': 14},
                'topmargin': 53,
                'leftmargin_even': 108,
                'pageheight': 1262,
                'pagewidth': 892,
                'rightmargin_even': 760,
                'scanned_source': False
            },
            metrics)

    def test_plot(self):
        # matplotlib is replaced in sys.modules by mocks, so this only
        # checks that metrics() reaches pyplot.savefig.
        matplotmock = MagicMock()
        mocks = {
            'matplotlib': matplotmock,
            'matplotlib.pyplot': MagicMock()
        }
        with patch.dict('sys.modules', mocks):
            self.analyzer.metrics(plotpath="foo/bar/baz")
            self.assertTrue(matplotmock.pyplot.savefig.called)

    def test_drawboxes(self):
        # PyPDF2 and reportlab are replaced in sys.modules by mocks, so this
        # only checks that drawboxes() calls into both libraries.
        pypdfmock = MagicMock()
        canvasmock = MagicMock()
        mocks = {
            'PyPDF2': pypdfmock,
            'reportlab': MagicMock(),
            'reportlab.pdfgen': MagicMock(),
            'reportlab.pdfgen.canvas': canvasmock
        }
        pdfpath = "test/files/pdfanalyze/lipsum.debug.pdf"
        with patch.dict('sys.modules', mocks):
            try:
                metrics = self.analyzer.metrics()
                self.analyzer.drawboxes(pdfpath, metrics=metrics)
                self.assertTrue(canvasmock.Canvas.called)
                self.assertTrue(pypdfmock.PdfFileReader.called)
                self.assertTrue(pypdfmock.PdfFileWriter.called)
            finally:
                # Clean up even when a call or an assertion above fails, so
                # a failed run cannot leave the debug PDF behind (same
                # pattern as test_margins).
                util.robust_remove(pdfpath)