Example #1
0
def test_page_contents_add(resources, outdir):
    pdf = Pdf.open(resources / 'graph.pdf')

    mat = PdfMatrix().rotated(45)

    stream1 = Stream(pdf, b'q ' + mat.encode() + b' cm')
    stream2 = Stream(pdf, b'Q')

    pdf.pages[0].page_contents_add(stream1, True)
    pdf.pages[0].page_contents_add(stream2, False)
    pdf.save(outdir / 'out.pdf')
Example #2
0
def _process_content_streams(
        *,
        pdf: Pdf,
        container: Object,
        shorthand=None
) -> Iterator[Union[VectorMarker, TextMarker, ImageInfo]]:
    """Find all individual instances of images drawn in the container

    Usually the container is a page, but it may also be a Form XObject.

    On a typical page images are stored inline or as regular images
    in an XObject.

    Form XObjects may include inline images, XObject images,
    and recursively, other Form XObjects; and also vector graphic objects.

    Every instance of an image being drawn somewhere is flattened and
    treated as a unique image, since if the same image is drawn multiple times
    on one page it may be drawn at differing resolutions, and our objective
    is to find the resolution at which the page can be rastered without
    downsampling.

    """

    if container.get('/Type') == '/Page' and '/Contents' in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif container.get(
            '/Type') == '/XObject' and container['/Subtype'] == '/Form':
        # Set the CTM to the state it was when the "Do" operator was
        # encountered that is drawing this instance of the Form XObject
        ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()

        # A Form XObject may provide its own matrix to map form space into
        # user space. Get this if one exists
        form_shorthand = container.get('/Matrix', PdfMatrix.identity())
        form_matrix = PdfMatrix(form_shorthand)

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        ctm = form_matrix @ ctm
        initial_shorthand = ctm.shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    if contentsinfo.found_vector:
        yield VectorMarker()
    if contentsinfo.found_text:
        yield TextMarker()
    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
Example #3
0
def test_page_contents_add(graph, outdir):
    pdf = graph

    mat = PdfMatrix().rotated(45)

    stream1 = Stream(pdf, b'q ' + mat.encode() + b' cm')
    stream2 = Stream(pdf, b'Q')

    pdf.pages[0].page_contents_add(stream1, True)
    pdf.pages[0].page_contents_add(stream2, False)
    pdf.save(outdir / 'out.pdf')

    with pytest.raises(TypeError, match="Not a Page"):
        Array([42]).page_contents_add(stream1)
Example #4
0
def test_unparse_cs():
    instructions = [
        ([], Operator('q')),
        ([*PdfMatrix.identity().shorthand], Operator('cm')),
        ([], Operator('Q')),
    ]
    assert unparse_content_stream(instructions).strip() == b'q\n1 0 0 1 0 0 cm\n Q'
Example #5
0
def _simple_interpret_content_stream(page: Union[Page, Object]):
    ctm = PdfMatrix.identity()
    stack: List[PdfMatrix] = []
    for instruction in parse_content_stream(page, operators='q Q cm Do'):
        if isinstance(instruction, ContentStreamInlineImage):
            continue
        operands, op = instruction.operands, instruction.operator
        if op == Operator('q'):
            stack.append(ctm)
        elif op == Operator('Q'):
            ctm = stack.pop()
        elif op == Operator('cm'):
            ctm = PdfMatrix(operands) @ ctm
        elif op == Operator('Do'):
            xobj_name = operands[0]
            yield (xobj_name, ctm)
Example #6
0
def _process_content_streams(*, pdf, container, shorthand=None):
    """Find all individual instances of images drawn in the container

    Usually the container is a page, but it may also be a Form XObject.

    On a typical page images are stored inline or as regular images
    in an XObject.

    Form XObjects may include inline images, XObject images,
    and recursively, other Form XObjects; and also vector graphic objects.

    Every instance of an image being drawn somewhere is flattened and
    treated as a unique image, since if the same image is drawn multiple times
    on one page it may be drawn at differing resolutions, and our objective
    is to find the resolution at which the page can be rastered without
    downsampling.

    """

    if container.get('/Type') == '/Page' and '/Contents' in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif container.get('/Type') == '/XObject' and container['/Subtype'] == '/Form':
        # Set the CTM to the state it was when the "Do" operator was
        # encountered that is drawing this instance of the Form XObject
        ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()

        # A Form XObject may provide its own matrix to map form space into
        # user space. Get this if one exists
        form_shorthand = container.get('/Matrix', PdfMatrix.identity())
        form_matrix = PdfMatrix(form_shorthand)

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        ctm = form_matrix @ ctm
        initial_shorthand = ctm.shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    if contentsinfo.found_vector:
        yield VectorInfo()
    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
Example #7
0
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Interpret the PDF content stream.

    The stack represents the state of the PDF graphics stack.  We are only
    interested in the current transformation matrix (CTM) so we only track
    this object; a full implementation would need to track many other items.

    The CTM is initialized to the mapping from user space to device space.
    PDF units are 1/72".  In a PDF viewer or printer this matrix is initialized
    to the transformation to device space.  For example if set to
    (1/72, 0, 0, 1/72, 0, 0) then all units would be calculated in inches.

    Images are always considered to be (0, 0) -> (1, 1).  Before drawing an
    image there should be a 'cm' that sets up an image coordinate system
    where drawing from (0, 0) -> (1, 1) will draw on the desired area of the
    page.

    PDF units suit our needs so we initialize ctm to the identity matrix.

    According to the PDF specification, the maximum stack depth is 32. Other
    viewers tolerate some amount beyond this.  We issue a warning if the
    stack depth exceeds the spec limit and set a hard limit beyond this to
    bound our memory requirements.  If the stack underflows behavior is
    undefined in the spec, but we just pretend nothing happened and leave the
    CTM unchanged.
    """

    stack = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    name_index = defaultdict(lambda: [])
    found_vector = False
    found_text = False
    vector_ops = set('S s f F f* B B* b b*'.split())
    text_showing_ops = set("""TJ Tj " '""".split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    operator_whitelist = ' '.join(vector_ops | text_showing_ops | image_ops)

    for n, graphobj in enumerate(
        _normalize_stack(
            pikepdf.parse_content_stream(contentstream, operator_whitelist)
        )
    ):
        operands, operator = graphobj
        if operator == 'q':
            stack.append(ctm)
            if len(stack) > 32:  # See docstring
                if len(stack) > 128:
                    raise RuntimeError(
                        "PDF graphics stack overflowed hard limit, operator %i" % n
                    )
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = stack.pop()
            except IndexError:
                # Keeping the ctm the same seems to be the only sensible thing
                # to do. Just pretend nothing happened, keep calm and carry on.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif operator == 'Do':
            image_name = operands[0]
            settings = XobjectSettings(
                name=image_name, shorthand=ctm.shorthand, stack_depth=len(stack)
            )
            xobject_settings.append(settings)
            name_index[image_name].append(settings)
        elif operator == 'INLINE IMAGE':  # BI/ID/EI are grouped into this
            iimage = operands[0]
            inline = InlineSettings(
                iimage=iimage, shorthand=ctm.shorthand, stack_depth=len(stack)
            )
            inline_images.append(inline)
        elif operator in vector_ops:
            found_vector = True
        elif operator in text_showing_ops:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_vector=found_vector,
        found_text=found_text,
        name_index=name_index,
    )
Example #8
0
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Interpret the PDF content stream

    The stack represents the state of the PDF graphics stack.  We are only
    interested in the current transformation matrix (CTM) so we only track
    this object; a full implementation would need to track many other items.

    The CTM is initialized to the mapping from user space to device space.
    PDF units are 1/72".  In a PDF viewer or printer this matrix is initialized
    to the transformation to device space.  For example if set to
    (1/72, 0, 0, 1/72, 0, 0) then all units would be calculated in inches.

    Images are always considered to be (0, 0) -> (1, 1).  Before drawing an
    image there should be a 'cm' that sets up an image coordinate system
    where drawing from (0, 0) -> (1, 1) will draw on the desired area of the
    page.

    PDF units suit our needs so we initialize ctm to the identity matrix.

    """

    stack = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    found_text = False
    text_operators = set(['Tj', 'TJ', '"', "'"])
    operator_whitelist = """q Q Do cm TJ Tj " ' BI ID EI"""

    for n, op in enumerate(
            _normalize_stack(
                pikepdf.parse_content_stream(contentstream,
                                             operator_whitelist))):
        operands, command = op

        if command == 'q':
            stack.append(ctm)
            if len(stack) > 32:
                raise RuntimeError("PDF graphics stack overflow, command %i" %
                                   n)
        elif command == 'Q':
            try:
                ctm = stack.pop()
            except IndexError:
                raise RuntimeError("PDF graphics stack underflow, command %i" %
                                   n)
        elif command == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif command == 'Do':
            image_name = operands[0]
            settings = XobjectSettings(name=image_name,
                                       shorthand=ctm.shorthand,
                                       stack_depth=len(stack))
            xobject_settings.append(settings)
        elif command == 'INLINE IMAGE':
            iimage = operands[0]
            inline = InlineSettings(iimage=iimage,
                                    shorthand=ctm.shorthand,
                                    stack_depth=len(stack))
            inline_images.append(inline)
        elif command in text_operators:
            found_text = True

    return ContentsInfo(xobject_settings=xobject_settings,
                        inline_images=inline_images,
                        found_text=found_text)
Example #9
0
    def _graft_text_layer(
        self,
        *,
        page_num: int,
        textpdf: Path,
        font: Object,
        font_key: Object,
        procset: Object,
        text_rotation: int,
        strip_old_text: bool,
    ):
        """Insert the text layer from text page 0 on to pdf_base at page_num"""

        log.debug("Grafting")
        if Path(textpdf).stat().st_size == 0:
            return

        # This is a pointer indicating a specific page in the base file
        with Pdf.open(textpdf) as pdf_text:
            pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

            base_page = self.pdf_base.pages.p(page_num)

            # The text page always will be oriented up by this stage but the original
            # content may have a rotation applied. Wrap the text stream with a rotation
            # so it will be oriented the same way as the rest of the page content.
            # (Previous versions OCRmyPDF rotated the content layer to match the text.)
            mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
            wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
            wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            translate = PdfMatrix().translated(-wt / 2, -ht / 2)
            untranslate = PdfMatrix().translated(wp / 2, hp / 2)
            corner = PdfMatrix().translated(mediabox[0], mediabox[1])
            # -rotation because the input is a clockwise angle and this formula
            # uses CCW
            text_rotation = -text_rotation % 360
            rotate = PdfMatrix().rotated(text_rotation)

            # Because of rounding of DPI, we might get a text layer that is not
            # identically sized to the target page. Scale to adjust. Normally this
            # is within 0.998.
            if text_rotation in (90, 270):
                wt, ht = ht, wt
            scale_x = wp / wt
            scale_y = hp / ht

            # log.debug('%r', scale_x, scale_y)
            scale = PdfMatrix().scaled(scale_x, scale_y)

            # Translate the text so it is centered at (0, 0), rotate it there, adjust
            # for a size different between initial and text PDF, then untranslate, and
            # finally move the lower left corner to match the mediabox
            ctm = translate @ rotate @ scale @ untranslate @ corner

            base_resources = _ensure_dictionary(base_page, Name.Resources)
            base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
            text_xobj_name = Name('/' + str(uuid.uuid4()))
            xobj = self.pdf_base.make_stream(pdf_text_contents)
            base_xobjs[text_xobj_name] = xobj
            xobj.Type = Name.XObject
            xobj.Subtype = Name.Form
            xobj.FormType = 1
            xobj.BBox = mediabox
            _update_resources(obj=xobj,
                              font=font,
                              font_key=font_key,
                              procset=[Name.PDF])

            pdf_draw_xobj = ((b'q %s cm\n' % ctm.encode()) +
                             (b'%s Do\n' % text_xobj_name) + b'\nQ\n')
            new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)

            if strip_old_text:
                strip_invisible_text(self.pdf_base, base_page)

            if hasattr(Page, 'contents_add'):
                # pikepdf >= 2.14 adds this method and deprecates the one below
                Page(base_page).contents_add(new_text_layer, prepend=True)
            else:
                # pikepdf < 2.14
                base_page.page_contents_add(new_text_layer,
                                            prepend=True)  # pragma: no cover

            _update_resources(obj=base_page,
                              font=font,
                              font_key=font_key,
                              procset=procset)