def _graft_text_layer(
    *,
    pdf_base,
    page_num,
    text,
    font,
    font_key,
    procset,
    rotation,
    strip_old_text,
    log,
):
    """Insert the text layer from text page 0 on to pdf_base at page_num.

    The content stream of the first page of the PDF at ``text`` is wrapped in
    a ``q <matrix> cm ... Q`` transform and prepended to the contents of the
    target page of ``pdf_base``. If the file at ``text`` is zero bytes long,
    the function returns without touching ``pdf_base``.

    Args:
        pdf_base: Open pikepdf document that receives the text layer.
        page_num: Page of ``pdf_base`` to modify, looked up with
            ``pdf_base.pages.p()``.
        text: Path of the PDF whose first page holds the text layer.
        font: Passed through unchanged to ``_update_page_resources``.
        font_key: Passed through unchanged to ``_update_page_resources``.
        procset: Passed through unchanged to ``_update_page_resources``.
        rotation: Clockwise angle, in degrees, applied to the text layer.
        strip_old_text: When true, ``strip_invisible_text`` is run on the
            target page before the new layer is added.
        log: Logger used for debug output.
    """
    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    # Copy everything needed out of the text PDF as plain Python values inside
    # a context manager, so the file is closed again even when a later step
    # raises.
    with pikepdf.open(text) as pdf_text:
        text_page = pdf_text.pages[0]
        pdf_text_contents = text_page.Contents.read_bytes()
        text_box = [float(text_page.MediaBox[v]) for v in range(4)]
    wt, ht = text_box[2] - text_box[0], text_box[3] - text_box[1]

    # This is a pointer indicating a specific page in the base file
    base_page = pdf_base.pages.p(page_num)
    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    # The text page always will be oriented up by this stage but the original
    # content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions OCRmyPDF rotated the content layer to match the text.)
    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
    corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])

    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        # After a quarter turn the text page's width spans the target's height.
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size different between initial and text PDF, then untranslate, and
    # finally move the lower left corner to match the mediabox
    ctm = translate @ rotate @ scale @ untranslate @ corner

    pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'
    new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

    if strip_old_text:
        strip_invisible_text(pdf_base, base_page)

    base_page.page_contents_add(new_text_layer, prepend=True)

    _update_page_resources(
        page=base_page, font=font, font_key=font_key, procset=procset
    )
def _graft_text_layer(
    self,
    *,
    page_num: int,
    textpdf: Path,
    font: pikepdf.Object,
    font_key: pikepdf.Object,
    procset: pikepdf.Object,
    text_rotation: int,
    strip_old_text: bool,
):
    """Graft page 0 of ``textpdf`` onto page ``page_num`` of ``self.pdf_base``.

    The text page's content stream is stored in ``self.pdf_base`` as a Form
    XObject, registered in the target page's ``/Resources`` ``/XObject``
    dictionary under a freshly generated UUID name. A short content stream
    that positions and draws that XObject is then prepended to the target
    page. A zero-byte ``textpdf`` makes the call a no-op.

    ``text_rotation`` is a clockwise angle in degrees. When
    ``strip_old_text`` is true, ``strip_invisible_text`` is applied to the
    target page before the new layer is added.
    """
    if Path(textpdf).stat().st_size == 0:
        # Nothing to graft from a zero-byte file.
        return

    with pikepdf.open(textpdf) as pdf_text:
        layer_bytes = pdf_text.pages[0].Contents.read_bytes()

        # Handle on the page of the base file that receives the layer.
        target_page = self.pdf_base.pages.p(page_num)

        # The text page is always upright at this point, whereas the original
        # content may carry a rotation. The text is therefore rotated to
        # match the page content (older OCRmyPDF versions rotated the content
        # layer instead).
        text_box = [float(pdf_text.pages[0].MediaBox[i]) for i in range(4)]
        text_w = text_box[2] - text_box[0]
        text_h = text_box[3] - text_box[1]

        page_box = [float(target_page.MediaBox[i]) for i in range(4)]
        page_w = page_box[2] - page_box[0]
        page_h = page_box[3] - page_box[1]

        to_origin = pikepdf.PdfMatrix().translated(-text_w / 2, -text_h / 2)
        to_page_centre = pikepdf.PdfMatrix().translated(page_w / 2, page_h / 2)
        to_corner = pikepdf.PdfMatrix().translated(page_box[0], page_box[1])

        # The caller supplies a clockwise angle; the matrix wants
        # counter-clockwise, hence the negation.
        ccw_angle = -text_rotation % 360
        spin = pikepdf.PdfMatrix().rotated(ccw_angle)

        # DPI rounding can leave the text layer a hair off the target page
        # size (normally within 0.998), so stretch it to fit. After a quarter
        # turn the text page's width lines up with the target's height.
        if ccw_angle in (90, 270):
            fit = pikepdf.PdfMatrix().scaled(page_w / text_h, page_h / text_w)
        else:
            fit = pikepdf.PdfMatrix().scaled(page_w / text_w, page_h / text_h)

        # Centre the text on (0, 0), rotate it there, rescale, move it to the
        # page centre, then shift to the MediaBox's lower-left corner.
        ctm = to_origin @ spin @ fit @ to_page_centre @ to_corner

        page_resources = _ensure_dictionary(target_page, Name.Resources)
        page_xobjects = _ensure_dictionary(page_resources, Name.XObject)
        form_name = Name("/" + str(uuid.uuid4()))
        form = self.pdf_base.make_stream(layer_bytes)
        page_xobjects[form_name] = form
        form.Type = Name.XObject
        form.Subtype = Name.Form
        form.FormType = 1
        form.BBox = page_box
        _update_resources(
            obj=form, font=font, font_key=font_key, procset=[Name.PDF]
        )

        # Content stream that sets the transform and paints the form.
        draw_form = b"".join(
            (
                b"q %s cm\n" % ctm.encode(),
                b"%s Do\n" % form_name,
                b"\nQ\n",
            )
        )
        drawing_stream = pikepdf.Stream(self.pdf_base, draw_form)

        if strip_old_text:
            strip_invisible_text(self.pdf_base, target_page)

        target_page.page_contents_add(drawing_stream, prepend=True)

        _update_resources(
            obj=target_page, font=font, font_key=font_key, procset=procset
        )
def _weave_layers_graft(
    *, pdf_base, page_num, text, font, font_key, procset, rotation, log
):
    """Insert the text layer from text page 0 on to pdf_base at page_num.

    The content stream of the first page of the PDF at ``text`` is wrapped in
    a ``q <matrix> cm ... Q`` transform and prepended to the contents of the
    target page of ``pdf_base``. If the file at ``text`` is zero bytes long,
    the function returns without touching ``pdf_base``.

    Args:
        pdf_base: Open pikepdf document that receives the text layer.
        page_num: Page of ``pdf_base`` to modify, looked up with
            ``pdf_base.pages.p()``.
        text: Path of the PDF whose first page holds the text layer.
        font: Passed through unchanged to ``_update_page_resources``.
        font_key: Passed through unchanged to ``_update_page_resources``.
        procset: Passed through unchanged to ``_update_page_resources``.
        rotation: Clockwise angle, in degrees, applied to the text layer.
        log: Logger used for debug output.
    """
    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    # Copy everything needed out of the text PDF as plain Python values inside
    # a context manager, so the file is always closed again.
    with pikepdf.open(text) as pdf_text:
        text_page = pdf_text.pages[0]
        pdf_text_contents = text_page.Contents.read_bytes()
        text_box = [float(text_page.MediaBox[v]) for v in range(4)]
    wt, ht = text_box[2] - text_box[0], text_box[3] - text_box[1]

    if not tesseract.has_textonly_pdf():
        # If we don't have textonly_pdf, edit the stream to delete the
        # instruction to draw the image Tesseract generated, which we do not
        # use. Only the first occurrence is blanked, with spaces of the same
        # length. replace() leaves the stream untouched when the instruction
        # is absent, whereas splicing at a failed find() index of -1 would
        # insert stray bytes near the end of the stream.
        pattern = b'/Im1 Do'
        pdf_text_contents = pdf_text_contents.replace(
            pattern, b' ' * len(pattern), 1
        )

    # This is a pointer indicating a specific page in the base file
    base_page = pdf_base.pages.p(page_num)
    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    # The text page always will be oriented up by this stage but the original
    # content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions OCRmyPDF rotated the content layer to match the text.)
    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)

    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        # After a quarter turn the text page's width spans the target's height.
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht

    log.debug('%r', (scale_x, scale_y))
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size different between initial and text PDF, then untranslate
    ctm = translate @ rotate @ scale @ untranslate

    pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'

    new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

    base_page.page_contents_add(new_text_layer, prepend=True)

    _update_page_resources(
        page=base_page, font=font, font_key=font_key, procset=procset
    )