def strip_invisible_text(pdf, page):
    """Replace the contents of *page* with a copy lacking invisible text objects.

    The page's content streams are coalesced and parsed. Instructions outside
    of a ``BT`` ... ``ET`` text object are always kept. Instructions inside a
    text object are buffered until its ``ET`` arrives, and are then kept only
    if the most recent ``Tr`` operand seen inside that object is not 3 (the
    render mode is assumed to be 0 at every ``BT``). The instructions of a
    text object that never reaches ``ET`` are dropped.

    The surviving instructions are unparsed and assigned to ``page.Contents``
    as a new stream owned by *pdf*.
    """
    begin_text = Operator('BT')
    end_text = Operator('ET')
    set_render_mode = Operator('Tr')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    Page(page).contents_coalesce()
    for operands, operator in parse_content_stream(page, ''):
        instruction = (operands, operator)

        if not inside_text:
            if operator == begin_text:
                inside_text = True
                mode = 0
                pending.append(instruction)
            else:
                kept.append(instruction)
            continue

        if operator == set_render_mode:
            mode = operands[0]
        pending.append(instruction)

        if operator == end_text:
            inside_text = False
            # Render mode 3 marks the whole buffered text object for removal.
            if mode != 3:
                kept.extend(pending)
            pending.clear()

    page.Contents = Stream(pdf, unparse_content_stream(kept))
def test_filter_names(pal):
    """FilterCollectNames yields empty output and records the name '/Im0'.

    The page's own Contents must still read back as non-empty afterwards.
    """
    first_page = Page(pal.pages[0])
    collector = FilterCollectNames()

    filtered = first_page.get_filtered_contents(collector)

    assert filtered == b''
    assert collector.names == ['/Im0']
    assert first_page.obj.Contents.read_bytes() != b''
def test_page_index(fourpages):
    """Page.index and pages.index() match enumeration order, also after a delete."""

    def check_positions():
        for position, raw_page in enumerate(fourpages.pages):
            assert Page(raw_page).index == position
            assert fourpages.pages.index(raw_page) == position

    check_positions()
    del fourpages.pages[1]
    # Remaining pages must be renumbered consistently.
    check_positions()
def test_formx(graph, outpdf):
    """Place page 0 as a Form XObject on a newly added blank page and save."""
    source_page = Page(graph.pages[0])
    form = source_page.as_form_xobject()

    graph.add_blank_page()
    target = Page(graph.pages[-1])
    placed_name = target.add_resource(form, Name.XObject)
    target_rect = Rectangle(0, 0, 200, 200)
    placement = target.calc_form_xobject_placement(form, placed_name, target_rect)

    # The generated placement bytes must mention the resource name.
    assert bytes(placed_name) in placement

    target.obj.Contents = graph.make_stream(placement)
    graph.save(outpdf)
def test_page_contents_add(graph, outdir):
    """Add a 'q <matrix> cm' and a 'Q' stream to page 0, save, then check that the
    deprecated ``page_contents_add`` rejects a non-page object.
    """
    pdf = graph
    rotation = PdfMatrix().rotated(45)
    opening = Stream(pdf, b'q ' + rotation.encode() + b' cm')
    closing = Stream(pdf, b'Q')

    # The opening stream is added with True, the closing stream with False.
    Page(pdf.pages[0]).contents_add(opening, True)
    Page(pdf.pages[0]).contents_add(closing, False)
    pdf.save(outdir / 'out.pdf')

    # An Array is not a page: expect a deprecation warning and a TypeError.
    with pytest.raises(TypeError, match="Not a Page"), pytest.deprecated_call():
        Array([42]).page_contents_add(opening)
def test_page_index_foreign_page(fourpages, sandwich):
    """Indexes shift after an insert; a page of another Pdf is never found."""
    not_in_pdf = "Page is not in this Pdf"
    pages = fourpages.pages

    with pytest.raises(ValueError, match=not_in_pdf):
        pages.index(sandwich.pages[0])

    third = pages[2]
    assert Page(third).index == 2

    # Inserting at position 2 moves the former third page to position 3.
    pages.insert(2, sandwich.pages[0])
    assert Page(pages[2]).index == 2
    assert Page(third).index == 3
    assert pages.index(third) == 3
    assert pages.index(Page(third)) == 3

    with pytest.raises(ValueError, match=not_in_pdf):
        # sandwich.pages[0] is still not "in" fourpages; it gets copied into it
        assert pages.index(sandwich.pages[0])
def test_page_labels():
    """Page.label reflects the rules stored in the document's /PageLabels."""
    page_count = 5
    pdf = Pdf.new()
    template = Dictionary(
        Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary()
    )
    for number in range(page_count):
        pdf.pages.append(template)
        pdf.pages[number].Contents = Stream(
            pdf, b"BT (Page %s) Tj ET" % str(number).encode()
        )

    label_rules = Array(
        [
            0,  # new label rules begin at index 0
            Dictionary(S=Name.r),  # use lowercase roman numerals, until...
            2,  # new label rules begin at index 2
            # label pages as 'Prefix-42', 'Prefix-43', ...
            Dictionary(S=Name.D, St=42, P='Prefix-'),
        ]
    )
    pdf.Root.PageLabels = pdf.make_indirect(Dictionary(Nums=label_rules))

    expected_labels = ['i', 'ii', 'Prefix-42', 'Prefix-43', 'Prefix-44']
    for number, expected in enumerate(expected_labels):
        raw_page = pdf.pages[number]
        assert Page(raw_page).label == expected
def test_unattached_page():
    """A Page built from a bare Dictionary raises ValueError for index and label."""
    loose_dict = Dictionary(
        Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary()
    )
    loose_page = Page(loose_dict)

    for attribute in ('index', 'label'):
        with pytest.raises(ValueError, match='not attached'):
            getattr(loose_page, attribute)
def test_fourpages_to_4up(fourpages, graph, outpdf):
    """Lay four pages onto one 1000x1000 page with overlays/underlays and save."""
    pdf = Pdf.new()
    pdf.add_blank_page(page_size=(1000, 1000))
    canvas = pdf.pages[0]
    pdf.pages.extend(fourpages.pages)

    # One 500x500 target rectangle per quadrant of the canvas.
    upper_left = Rectangle(0, 500, 500, 1000)
    upper_right = Rectangle(500, 500, 1000, 1000)
    lower_left = Rectangle(0, 0, 500, 500)
    lower_right = Rectangle(500, 0, 1000, 500)

    # Keep explicit Page(pdf.pages[..]) here
    canvas.add_overlay(pdf.pages[1], upper_left)
    canvas.add_overlay(Page(pdf.pages[2]), upper_right)
    canvas.add_overlay(Page(pdf.pages[3]).as_form_xobject(), lower_left)
    canvas.add_underlay(pdf.pages[4], lower_right)
    canvas.add_underlay(graph.pages[0])

    # A plain dictionary is not an acceptable overlay source.
    with pytest.raises(TypeError):
        canvas.add_overlay(Dictionary(Key=123))

    del pdf.pages[1:]
    pdf.save(outpdf)
def __str__(self):
    """Return ``'<indicator> <title> -> <destination>'`` for this item.

    The indicator is ``[ ]`` when there are no children, ``[+]`` when there
    are children and the item is closed, and ``[-]`` otherwise. The
    destination part is the label of the page found in ``destination[0]``, or
    ``<Action>`` when ``destination`` is None.
    """
    if not self.children:
        marker = '[ ]'
    else:
        marker = '[+]' if self.is_closed else '[-]'

    if self.destination is None:
        target = '<Action>'
    else:
        target = Page(self.destination[0]).label

    return f'{marker} {self.title} -> {target}'
def test_issue160_tokenfilter_refcounting(resources, outpdf):
    # Ensure that add_content_token_filter properly "remembers" token filters
    # that are not needed until .save()
    class MyFilter(TokenFilter):
        """Replace every string token with a run of one fixed character."""

        def __init__(self, replace):
            super().__init__()
            self.replace = bytes(replace, 'ascii')

        def handle_token(self, tok):
            is_string = tok.type_ == TokenType.string
            if not is_string:
                return tok
            # Same length as the original raw value, filled with the marker.
            filler = self.replace * len(tok.raw_value)
            return Token(TokenType.string, filler)

    with Pdf.open(resources / 'outlines.pdf') as pdf:
        for num, raw_page in enumerate(pdf.pages):
            page = Page(raw_page)
            # Last decimal digit of the page position is the replacement char.
            token_filter = MyFilter(('%d' % num)[-1])
            page.add_content_token_filter(token_filter)
        pdf.save(outpdf)
def test_externalize(resources):
    """externalize_inline_images() turns the inline image into an XObject image."""
    with Pdf.open(resources / 'image-mono-inline.pdf') as pdf:
        first_page = Page(pdf.pages[0])
        first_page.contents_coalesce()

        # Before: an inline image (BI operator) and no XObject resources.
        contents = first_page.obj.Contents.read_bytes()
        assert b'BI' in contents, "no inline image"
        assert Name.XObject not in first_page.obj.Resources, "expected no xobjs"

        first_page.externalize_inline_images()

        # After: an XObject entry exists and the first image is an Image.
        assert Name.XObject in first_page.obj.Resources, "image not created"
        page_images = pdf.pages[0].images
        first_image = next(iter(page_images.values()))
        assert first_image.Subtype == Name.Image

        assert first_page.label == '1'
def _graft_text_layer(
    self,
    *,
    page_num: int,
    textpdf: Path,
    font: Object,
    font_key: Object,
    procset: Object,
    text_rotation: int,
    strip_old_text: bool,
) -> None:
    """Insert the text layer from text page 0 on to pdf_base at page_num.

    The content stream of the first page of ``textpdf`` is copied into
    ``self.pdf_base`` as a Form XObject, registered in the base page's
    ``/Resources /XObject`` dictionary under a random UUID name, and drawn by
    a new content stream that is prepended to the base page.

    Args:
        page_num: Page number handed to ``self.pdf_base.pages.p()``
            (presumably 1-based, per pikepdf's ``p()`` accessor - confirm).
        textpdf: Path of the PDF holding the text layer. If the file is
            empty (size 0) nothing is done.
        font: Forwarded to ``_update_resources`` for both the new XObject
            and the base page.
        font_key: Forwarded to ``_update_resources`` alongside ``font``.
        procset: Forwarded to ``_update_resources`` for the base page only;
            the XObject always receives ``[Name.PDF]``.
        text_rotation: Clockwise rotation angle in degrees to apply to the
            text layer.
        strip_old_text: If true, ``strip_invisible_text`` is run on the base
            page before the new layer is added.
    """
    log.debug("Grafting")
    # A zero-byte text file means there is no text layer to graft.
    if Path(textpdf).stat().st_size == 0:
        return

    with Pdf.open(textpdf) as pdf_text:
        pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

        # This is a pointer indicating a specific page in the base file
        base_page = self.pdf_base.pages.p(page_num)

        # The text page always will be oriented up by this stage but the original
        # content may have a rotation applied. Wrap the text stream with a rotation
        # so it will be oriented the same way as the rest of the page content.
        # (Previous versions OCRmyPDF rotated the content layer to match the text.)

        # Width and height of the text page...
        mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
        wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

        # ...and of the base page. From here on `mediabox` is the BASE page's
        # media box; it is reused below for `corner` and for the XObject BBox.
        mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
        wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

        translate = PdfMatrix().translated(-wt / 2, -ht / 2)
        untranslate = PdfMatrix().translated(wp / 2, hp / 2)
        corner = PdfMatrix().translated(mediabox[0], mediabox[1])
        # -rotation because the input is a clockwise angle and this formula
        # uses CCW. The modulo normalizes the negated angle into [0, 360).
        text_rotation = -text_rotation % 360
        rotate = PdfMatrix().rotated(text_rotation)

        # Because of rounding of DPI, we might get a text layer that is not
        # identically sized to the target page. Scale to adjust. Normally this
        # is within 0.998.
        # For quarter turns the text page's width and height trade places
        # before the scale factors are computed.
        if text_rotation in (90, 270):
            wt, ht = ht, wt
        scale_x = wp / wt
        scale_y = hp / ht

        # log.debug('%r', scale_x, scale_y)
        scale = PdfMatrix().scaled(scale_x, scale_y)

        # Translate the text so it is centered at (0, 0), rotate it there, adjust
        # for a size different between initial and text PDF, then untranslate, and
        # finally move the lower left corner to match the mediabox
        ctm = translate @ rotate @ scale @ untranslate @ corner

        # Register the text layer as a Form XObject of the base page under a
        # freshly generated UUID name.
        base_resources = _ensure_dictionary(base_page, Name.Resources)
        base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
        text_xobj_name = Name('/' + str(uuid.uuid4()))
        xobj = self.pdf_base.make_stream(pdf_text_contents)
        base_xobjs[text_xobj_name] = xobj
        xobj.Type = Name.XObject
        xobj.Subtype = Name.Form
        xobj.FormType = 1
        xobj.BBox = mediabox
        _update_resources(obj=xobj, font=font, font_key=font_key, procset=[Name.PDF])

        # Content stream that saves graphics state, applies the CTM, draws
        # the XObject and restores the state.
        pdf_draw_xobj = (
            (b'q %s cm\n' % ctm.encode())
            + (b'%s Do\n' % text_xobj_name)
            + b'\nQ\n'
        )
        new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)

        # Strip the old text before the new layer is added to the page.
        if strip_old_text:
            strip_invisible_text(self.pdf_base, base_page)

        if hasattr(Page, 'contents_add'):
            # pikepdf >= 2.14 adds this method and deprecates the one below
            Page(base_page).contents_add(new_text_layer, prepend=True)
        else:
            # pikepdf < 2.14
            base_page.page_contents_add(new_text_layer, prepend=True)  # pragma: no cover

        _update_resources(obj=base_page, font=font, font_key=font_key, procset=procset)
def test_unindexed_page(graph):
    """Page.index raises ValueError once the page was deleted from its Pdf."""
    raw_first = graph.pages[0]
    removed_page = Page(raw_first)

    del graph.pages[0]

    with pytest.raises(ValueError, match='not consistently registered'):
        removed_page.index
def graph_page(graph):
    """Return the first page of *graph* wrapped in a ``Page``."""
    first_raw_page = graph.pages[0]
    return Page(first_raw_page)
def test_filter_thru(pal, filter, expected):
    """With a ``filter()`` instance installed, page 0 reads back as *expected*."""
    target = Page(pal.pages[0])
    # The filter instance is passed as a temporary; no local reference is kept.
    target.add_content_token_filter(filter())
    assert target.obj.Contents.read_bytes() == expected
def test_invalid_handle_token(pal):
    """Filtering page 0 through FilterInvalid raises TypeError or PdfError."""
    accepted_errors = (TypeError, PdfError)
    first_page = Page(pal.pages[0])
    with pytest.raises(accepted_errors):
        first_page.get_filtered_contents(FilterInvalid())
def test_has_text(resources, test_file, expected):
    """Every page of *test_file* reports ``has_text()`` equal to *expected*.

    Args:
        resources: Directory containing the test PDFs.
        test_file: File name of the PDF to open, relative to *resources*.
        expected: The value ``has_text()`` must return for each page.
    """
    # Use the context manager so the file is closed even if an assertion
    # fails; the previous bare Pdf.open() never closed it.
    with Pdf.open(resources / test_file) as pdf:
        for raw_page in pdf.pages:
            page = Page(raw_page)
            assert page.has_text() == expected
def test_tokenfilter_is_abstract(pal):
    """Using the TokenFilter base class directly raises RuntimeError or PdfError."""
    accepted_errors = (RuntimeError, PdfError)
    first_page = Page(pal.pages[0])
    with pytest.raises(accepted_errors):
        first_page.get_filtered_contents(TokenFilter())
def test_invalid_tokenfilter(pal):
    """Passing an empty list where a token filter is expected raises TypeError."""
    first_page = Page(pal.pages[0])
    not_a_filter = []
    with pytest.raises(TypeError):
        first_page.get_filtered_contents(not_a_filter)