Ejemplo n.º 1
0
def strip_invisible_text(pdf, page):
    """Rewrite ``page.Contents`` without text objects drawn in render mode 3.

    The page's content streams are coalesced, then parsed instruction by
    instruction. Everything between a ``BT`` and its matching ``ET`` is
    buffered; the buffer is copied to the output only if the last ``Tr``
    operand seen inside that text object is not 3 (the render mode starts
    at 0 for every text object). Instructions outside text objects are
    always kept. The result replaces ``page.Contents`` with a new stream
    owned by ``pdf``.
    """
    kept = []
    pending = []
    inside_text = False
    mode = 0

    wrapper = Page(page)
    wrapper.contents_coalesce()
    for operands, operator in parse_content_stream(page, ''):
        if inside_text:
            # Track the most recent text render mode within this text object
            if operator == Operator('Tr'):
                mode = operands[0]
            pending.append((operands, operator))
            if operator == Operator('ET'):
                # Text object finished: keep it unless it was drawn in mode 3
                inside_text = False
                if mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == Operator('BT'):
            inside_text = True
            mode = 0
            pending.append((operands, operator))
        else:
            kept.append((operands, operator))

    page.Contents = Stream(pdf, unparse_content_stream(kept))
Ejemplo n.º 2
0
def test_filter_names(pal):
    """A name-collecting filter yields no output and leaves the page intact.

    ``get_filtered_contents`` is expected to return empty bytes, the filter
    is expected to have recorded ``/Im0``, and the page's own contents must
    still be non-empty afterwards.
    """
    first_page = Page(pal.pages[0])
    collector = FilterCollectNames()
    filtered = first_page.get_filtered_contents(collector)
    assert filtered == b''
    assert collector.names == ['/Im0']
    assert first_page.obj.Contents.read_bytes() != b''
Ejemplo n.º 3
0
def test_page_index(fourpages):
    """Page indices match enumeration order before and after a deletion."""

    def check_indices():
        # Both the Page wrapper and the page list must agree on the position
        for expected, raw_page in enumerate(fourpages.pages):
            assert Page(raw_page).index == expected
            assert fourpages.pages.index(raw_page) == expected

    check_indices()
    del fourpages.pages[1]
    check_indices()
Ejemplo n.º 4
0
def test_formx(graph, outpdf):
    """Place the first page as a form XObject on a new blank page and save."""
    source_page = Page(graph.pages[0])
    formx = source_page.as_form_xobject()

    graph.add_blank_page()
    blank = Page(graph.pages[-1])
    placed_name = blank.add_resource(formx, Name.XObject)

    target_rect = Rectangle(0, 0, 200, 200)
    placement = blank.calc_form_xobject_placement(formx, placed_name, target_rect)
    # The generated drawing commands must refer to the registered resource name
    assert bytes(placed_name) in placement

    blank.obj.Contents = graph.make_stream(placement)
    graph.save(outpdf)
Ejemplo n.º 5
0
def test_page_contents_add(graph, outdir):
    """Add two content streams to the first page, then check the legacy API.

    The first stream opens a graphics state with a 45-degree rotation matrix
    and the second closes it. Calling the deprecated ``page_contents_add`` on
    a non-page object must raise ``TypeError`` and emit a deprecation warning.
    """
    rotation = PdfMatrix().rotated(45)
    opening = Stream(graph, b'q ' + rotation.encode() + b' cm')
    closing = Stream(graph, b'Q')

    Page(graph.pages[0]).contents_add(opening, True)
    Page(graph.pages[0]).contents_add(closing, False)
    graph.save(outdir / 'out.pdf')

    with pytest.raises(TypeError, match="Not a Page"), pytest.deprecated_call():
        Array([42]).page_contents_add(opening)
Ejemplo n.º 6
0
def test_page_index_foreign_page(fourpages, sandwich):
    """Check index lookups when a page from another Pdf is involved.

    Looking up a ``sandwich`` page in ``fourpages`` must raise ``ValueError``
    both before and after that page is inserted into ``fourpages``. Inserting
    it at position 2 must shift the page previously at index 2 to index 3.
    """
    with pytest.raises(ValueError, match="Page is not in this Pdf"):
        fourpages.pages.index(sandwich.pages[0])

    p3 = fourpages.pages[2]
    assert Page(p3).index == 2
    fourpages.pages.insert(2, sandwich.pages[0])
    assert Page(fourpages.pages[2]).index == 2
    assert Page(p3).index == 3

    # Lookup must work for both the raw page object and its Page wrapper
    assert fourpages.pages.index(p3) == 3
    assert fourpages.pages.index(Page(p3)) == 3

    with pytest.raises(ValueError, match="Page is not in this Pdf"):
        # sandwich.pages[0] is still not "in" fourpages; it gets copied into it.
        # No assert here: the call itself must raise, and asserting on its
        # return value would only obscure a failure to raise.
        fourpages.pages.index(sandwich.pages[0])
Ejemplo n.º 7
0
def test_page_labels():
    """Page labels follow the rules stored in the document's ``/PageLabels``.

    Builds a five-page PDF whose first two pages use lowercase roman numerals
    and whose remaining pages are labelled ``Prefix-42`` onwards.
    """
    pdf = Pdf.new()
    template = Dictionary(
        Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary()
    )
    for n in range(5):
        pdf.pages.append(template)
        pdf.pages[n].Contents = Stream(pdf, b"BT (Page %s) Tj ET" % str(n).encode())

    # Alternating (start index, rule dictionary) pairs
    label_rules = [
        0,  # new label rules begin at index 0
        Dictionary(S=Name.r),  # use lowercase roman numerals, until...
        2,  # new label rules begin at index 2
        Dictionary(S=Name.D, St=42, P='Prefix-'),  # 'Prefix-42', 'Prefix-43', ...
    ]
    pdf.Root.PageLabels = pdf.make_indirect(Dictionary(Nums=Array(label_rules)))

    expected_labels = ['i', 'ii', 'Prefix-42', 'Prefix-43', 'Prefix-44']
    for n, expected in enumerate(expected_labels):
        assert Page(pdf.pages[n]).label == expected
Ejemplo n.º 8
0
def test_unattached_page():
    """``index`` and ``label`` raise ``ValueError`` for a page not in any Pdf."""
    loose_dict = Dictionary(
        Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary()
    )
    loose_page = Page(loose_dict)

    for attribute in ('index', 'label'):
        with pytest.raises(ValueError, match='not attached'):
            getattr(loose_page, attribute)
Ejemplo n.º 9
0
def test_fourpages_to_4up(fourpages, graph, outpdf):
    """Compose several pages onto one 1000x1000 sheet and save the result.

    Overlays and underlays are supplied as raw pages, ``Page`` wrappers and a
    form XObject. A plain dictionary is rejected with ``TypeError``. All pages
    except the composed sheet are removed before saving.
    """
    target = Pdf.new()
    target.add_blank_page(page_size=(1000, 1000))
    sheet = target.pages[0]

    target.pages.extend(fourpages.pages)

    # Keep explicit Page(pdf.pages[..]) here
    top_left = Rectangle(0, 500, 500, 1000)
    sheet.add_overlay(target.pages[1], top_left)
    top_right = Rectangle(500, 500, 1000, 1000)
    sheet.add_overlay(Page(target.pages[2]), top_right)
    bottom_left = Rectangle(0, 0, 500, 500)
    sheet.add_overlay(Page(target.pages[3]).as_form_xobject(), bottom_left)
    bottom_right = Rectangle(500, 0, 1000, 500)
    sheet.add_underlay(target.pages[4], bottom_right)

    # No rectangle given for this underlay
    sheet.add_underlay(graph.pages[0])

    with pytest.raises(TypeError):
        sheet.add_overlay(Dictionary(Key=123))

    del target.pages[1:]

    target.save(outpdf)
Ejemplo n.º 10
0
 def __str__(self):
     """Return ``'<indicator> <title> -> <destination>'`` for this item.

     The indicator is ``[ ]`` when there are no children, ``[+]`` when the
     item has children and is closed, and ``[-]`` otherwise. The destination
     is the label of the page in the first element of ``self.destination``,
     or ``<Action>`` when ``self.destination`` is ``None``.
     """
     if not self.children:
         marker = '[ ]'
     elif self.is_closed:
         marker = '[+]'
     else:
         marker = '[-]'

     if self.destination is None:
         target = '<Action>'
     else:
         target = Page(self.destination[0]).label
     return f'{marker} {self.title} -> {target}'
Ejemplo n.º 11
0
def test_issue160_tokenfilter_refcounting(resources, outpdf):
    """Attach a fresh token filter to every page, then save the document."""
    # Ensure that add_content_token_filter properly "remembers" token filters
    # that are not needed until .save()
    class MyFilter(TokenFilter):
        """Replace each string token with a repeated single character."""

        def __init__(self, replace):
            super().__init__()
            self.replace = bytes(replace, 'ascii')

        def handle_token(self, tok):
            if tok.type_ == TokenType.string:
                # Same length as the original raw value, filled with the marker
                raw_length = len(tok.raw_value)
                return Token(TokenType.string, self.replace * raw_length)
            return tok

    with Pdf.open(resources / 'outlines.pdf') as pdf:
        for num, raw_page in enumerate(pdf.pages):
            wrapped = Page(raw_page)
            # The filter's marker is the last decimal digit of the page number
            token_filter = MyFilter(('%d' % num)[-1])
            wrapped.add_content_token_filter(token_filter)
        pdf.save(outpdf)
Ejemplo n.º 12
0
def test_externalize(resources):
    """Externalizing inline images creates an image XObject on the page."""
    with Pdf.open(resources / 'image-mono-inline.pdf') as pdf:
        first = Page(pdf.pages[0])
        first.contents_coalesce()

        # Preconditions: an inline image is present and no XObjects exist yet
        coalesced = first.obj.Contents.read_bytes()
        assert b'BI' in coalesced, "no inline image"
        assert Name.XObject not in first.obj.Resources, "expected no xobjs"

        first.externalize_inline_images()
        assert Name.XObject in first.obj.Resources, "image not created"

        page_images = pdf.pages[0].images
        extracted = next(iter(page_images.values()))
        assert extracted.Subtype == Name.Image

        assert first.label == '1'
Ejemplo n.º 13
0
    def _graft_text_layer(
        self,
        *,
        page_num: int,
        textpdf: Path,
        font: Object,
        font_key: Object,
        procset: Object,
        text_rotation: int,
        strip_old_text: bool,
    ) -> None:
        """Insert the text layer from text page 0 on to pdf_base at page_num.

        The contents of the first page of ``textpdf`` are wrapped in a new
        Form XObject that is registered in the base page's resources, and a
        small content stream that draws that XObject under a computed
        transformation matrix is prepended to the base page's contents.

        Args:
            page_num: Page of ``self.pdf_base`` to graft onto, looked up with
                ``pages.p()``.
            textpdf: Path of the PDF holding the text layer. If the file is
                empty (zero bytes) nothing is done.
            font: Font object passed to ``_update_resources`` for both the
                new XObject and the base page.
            font_key: Resource key for ``font``, passed along with it.
            procset: Procset applied to the base page's resources. The new
                XObject itself always gets ``[Name.PDF]``.
            text_rotation: Clockwise rotation, in degrees, to apply to the
                text layer.
            strip_old_text: If true, ``strip_invisible_text`` is run on the
                base page before the new layer is added.
        """

        log.debug("Grafting")
        # An empty file means there is no text layer to graft
        if Path(textpdf).stat().st_size == 0:
            return

        with Pdf.open(textpdf) as pdf_text:
            pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

            # This is a pointer indicating a specific page in the base file
            # NOTE(review): pages.p() presumably takes a 1-based page number - confirm
            base_page = self.pdf_base.pages.p(page_num)

            # The text page always will be oriented up by this stage but the original
            # content may have a rotation applied. Wrap the text stream with a rotation
            # so it will be oriented the same way as the rest of the page content.
            # (Previous versions OCRmyPDF rotated the content layer to match the text.)
            # Width and height of the text page
            mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
            wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            # Width and height of the base page; from here on ``mediabox``
            # refers to the base page's MediaBox
            mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
            wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            translate = PdfMatrix().translated(-wt / 2, -ht / 2)
            untranslate = PdfMatrix().translated(wp / 2, hp / 2)
            corner = PdfMatrix().translated(mediabox[0], mediabox[1])
            # -rotation because the input is a clockwise angle and this formula
            # uses CCW
            text_rotation = -text_rotation % 360
            rotate = PdfMatrix().rotated(text_rotation)

            # Because of rounding of DPI, we might get a text layer that is not
            # identically sized to the target page. Scale to adjust. Normally this
            # is within 0.998.
            # For quarter turns the text page's width and height trade places
            # before being compared with the base page's dimensions.
            if text_rotation in (90, 270):
                wt, ht = ht, wt
            scale_x = wp / wt
            scale_y = hp / ht

            # log.debug('%r', scale_x, scale_y)
            scale = PdfMatrix().scaled(scale_x, scale_y)

            # Translate the text so it is centered at (0, 0), rotate it there, adjust
            # for a size different between initial and text PDF, then untranslate, and
            # finally move the lower left corner to match the mediabox
            ctm = translate @ rotate @ scale @ untranslate @ corner

            # Register the text layer as a Form XObject under a unique,
            # randomly generated name in the base page's resources
            base_resources = _ensure_dictionary(base_page, Name.Resources)
            base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
            text_xobj_name = Name('/' + str(uuid.uuid4()))
            xobj = self.pdf_base.make_stream(pdf_text_contents)
            base_xobjs[text_xobj_name] = xobj
            xobj.Type = Name.XObject
            xobj.Subtype = Name.Form
            xobj.FormType = 1
            # Uses the base page's MediaBox values computed above
            xobj.BBox = mediabox
            _update_resources(obj=xobj,
                              font=font,
                              font_key=font_key,
                              procset=[Name.PDF])

            # Content stream that draws the XObject under ``ctm`` inside its
            # own saved/restored graphics state (q ... Q)
            pdf_draw_xobj = ((b'q %s cm\n' % ctm.encode()) +
                             (b'%s Do\n' % text_xobj_name) + b'\nQ\n')
            new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)

            # Remove the existing invisible text before adding the new layer
            if strip_old_text:
                strip_invisible_text(self.pdf_base, base_page)

            if hasattr(Page, 'contents_add'):
                # pikepdf >= 2.14 adds this method and deprecates the one below
                Page(base_page).contents_add(new_text_layer, prepend=True)
            else:
                # pikepdf < 2.14
                base_page.page_contents_add(new_text_layer,
                                            prepend=True)  # pragma: no cover

            _update_resources(obj=base_page,
                              font=font,
                              font_key=font_key,
                              procset=procset)
Ejemplo n.º 14
0
def test_unindexed_page(graph):
    """``index`` raises ``ValueError`` once the page is deleted from its Pdf."""
    removed_raw = graph.pages[0]
    removed_page = Page(removed_raw)
    del graph.pages[0]

    expected_message = 'not consistently registered'
    with pytest.raises(ValueError, match=expected_message):
        removed_page.index
Ejemplo n.º 15
0
def graph_page(graph):
    """Return the first page of ``graph`` wrapped in a ``Page``."""
    first_raw_page = graph.pages[0]
    return Page(first_raw_page)
Ejemplo n.º 16
0
def test_filter_thru(pal, filter, expected):
    """Contents read back after attaching ``filter()`` must equal ``expected``."""
    first_page = Page(pal.pages[0])
    # The filter instance is deliberately not kept in a local variable
    first_page.add_content_token_filter(filter())
    assert first_page.obj.Contents.read_bytes() == expected
Ejemplo n.º 17
0
def test_invalid_handle_token(pal):
    """Filtering with ``FilterInvalid`` raises ``TypeError`` or ``PdfError``."""
    first_page = Page(pal.pages[0])
    acceptable_errors = (TypeError, PdfError)
    with pytest.raises(acceptable_errors):
        first_page.get_filtered_contents(FilterInvalid())
Ejemplo n.º 18
0
def test_has_text(resources, test_file, expected):
    """Every page of ``test_file`` must report ``has_text() == expected``.

    The Pdf is opened as a context manager so the file is closed even when
    an assertion fails, matching the other uses of ``Pdf.open`` in this file.
    """
    with Pdf.open(resources / test_file) as pdf:
        for raw_page in pdf.pages:
            assert Page(raw_page).has_text() == expected
Ejemplo n.º 19
0
def test_tokenfilter_is_abstract(pal):
    """Using a bare ``TokenFilter`` raises ``RuntimeError`` or ``PdfError``."""
    first_page = Page(pal.pages[0])
    acceptable_errors = (RuntimeError, PdfError)
    with pytest.raises(acceptable_errors):
        first_page.get_filtered_contents(TokenFilter())
Ejemplo n.º 20
0
def test_invalid_tokenfilter(pal):
    """Passing a plain list instead of a token filter raises ``TypeError``."""
    first_page = Page(pal.pages[0])
    not_a_filter = []
    with pytest.raises(TypeError):
        first_page.get_filtered_contents(not_a_filter)