コード例 #1
0
def test_page_labels():
    """Check that a /PageLabels number tree drives ``Page.label``.

    Builds a five-page document, installs label rules that switch from
    lowercase roman numerals to a prefixed decimal style at page index 2, and
    compares the label of every page with the expected text.
    """
    expected_labels = ['i', 'ii', 'Prefix-42', 'Prefix-43', 'Prefix-44']

    pdf = Pdf.new()
    template = Dictionary(
        Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary()
    )
    for index in range(5):
        pdf.pages.append(template)
        pdf.pages[index].Contents = Stream(
            pdf, b"BT (Page %s) Tj ET" % str(index).encode()
        )

    # Rule starting at index 0: lowercase roman numerals, until...
    roman_rule = Dictionary(S=Name.r)
    # ...the rule starting at index 2: 'Prefix-42', 'Prefix-43', ...
    prefixed_rule = Dictionary(S=Name.D, St=42, P='Prefix-')
    label_tree = Dictionary(Nums=Array([0, roman_rule, 2, prefixed_rule]))
    pdf.Root.PageLabels = pdf.make_indirect(label_tree)

    for index, expected in enumerate(expected_labels):
        assert Page(pdf.pages[index]).label == expected
コード例 #2
0
 def test_filter_decodeparms_mismatch(self, stream_object):
     """One filter paired with two decode_parms entries raises ValueError."""
     payload = compress(b'foo')
     filters = [Name.FlateDecode]
     parms = [Dictionary(), Dictionary()]
     with pytest.raises(ValueError, match=r"filter.*and decode_parms"):
         stream_object.write(payload, filter=filters, decode_parms=parms)
コード例 #3
0
def test_oddwidth_grayscale(bits, check_pixels):
    """Extract a 3x2 grayscale image and verify selected pixels.

    The image is attached to a blank page, pulled back out through
    ``PdfImage.extract_to`` and reopened with PIL so that its mode, its size
    and the ``(x, y, value)`` triples in ``check_pixels`` can be checked.
    """
    pdf = pikepdf.new()
    pdf.add_blank_page(page_size=(108, 72))
    first_page = pdf.pages[0]

    raw_pixels = bytes([0b00011011, 0b11011000, 0b00000001])
    image_stream = Stream(
        pdf,
        raw_pixels,
        BitsPerComponent=bits,
        ColorSpace=Name.DeviceGray,
        Width=3,
        Height=2,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    first_page.Contents = Stream(pdf, b'108 0 0 72 0 0 cm /Im0 Do')
    first_page.Resources = Dictionary(XObject=Dictionary(Im0=image_stream))

    pdf_image = PdfImage(pdf.pages[0].Resources.XObject.Im0)
    assert pdf_image.mode == 'L'
    assert pdf_image.bits_per_component == bits

    buffer = BytesIO()
    pdf_image.extract_to(stream=buffer)
    buffer.seek(0)
    extracted = Image.open(buffer)
    assert extracted.mode == 'L'
    assert extracted.size == (3, 2)

    # For debugging, the document can be written out with:
    # pdf.save(f'oddbit_{bits}.pdf')
    for x, y, expected in check_pixels:
        assert extracted.getpixel((x, y)) == expected
コード例 #4
0
    def test_repr_dict(self):
        """repr() of a Dictionary matches the expected text and survives eval().

        How the real number is printed depends on the libqpdf version, so the
        expected text is assembled accordingly. Whitespace is ignored when the
        two texts are compared.
        """

        def squash(text):
            # Remove every whitespace character so layout does not matter.
            return ''.join(text.split())

        sample = Dictionary({
            '/Boolean': True,
            '/Integer': 42,
            '/Real': Decimal('42.42'),
            '/String': String('hi'),
            '/Array': Array([1, 2, 3.14]),
            '/Operator': Operator('q'),
            '/Dictionary': Dictionary({'/Color': 'Red'}),
            '/None': None,
        })

        qpdf_version = LooseVersion(pikepdf.__libqpdf_version__)
        short_pi = '3.14' if qpdf_version >= LooseVersion('10.2.0') else '3.140000'
        expected = ("""\
            pikepdf.Dictionary({
                "/Array": [ 1, 2, Decimal('%s') ],
                "/Boolean": True,
                "/Dictionary": {
                    "/Color": "Red"
                },
                "/Integer": 42,
                "/None": None,
                "/Operator": pikepdf.Operator("q"),
                "/Real": Decimal('42.42'),
                "/String": "hi"
            })
        """ % short_pi)

        assert squash(repr(sample)) == squash(expected)
        assert eval(repr(sample)) == sample
コード例 #5
0
    def test_repr_dict(self):
        """repr() of a Dictionary matches the expected text and survives eval().

        Whitespace is ignored when the two texts are compared.
        """

        def squash(text):
            # Remove every whitespace character so layout does not matter.
            return ''.join(text.split())

        sample = Dictionary({
            '/Boolean': True,
            '/Integer': 42,
            '/Real': Decimal('42.42'),
            '/String': String('hi'),
            '/Array': Array([1, 2, 3.14]),
            '/Operator': Operator('q'),
            '/Dictionary': Dictionary({'/Color': 'Red'})
        })
        expected = """\
            pikepdf.Dictionary({
                "/Array": [ 1, 2, Decimal('3.140000') ],
                "/Boolean": True,
                "/Dictionary": {
                    "/Color": "Red"
                },
                "/Integer": 42,
                "/Operator": pikepdf.Operator("q"),
                "/Real": Decimal('42.42'),
                "/String": "hi"
            })
        """

        assert squash(repr(sample)) == squash(expected)
        assert eval(repr(sample)) == sample
コード例 #6
0
ファイル: test_parsers.py プロジェクト: jeromerobert/pikepdf
def test_invalid_stream_object():
    """parse_content_stream() rejects inputs that are not pages or streams.

    Every ``pytest.raises`` block contains only the call under test, so an
    exception raised while building the input cannot satisfy the expectation
    by accident.
    """
    with pytest.raises(TypeError):
        parse_content_stream(42)

    plain_dictionary = Dictionary({"/Hi": 3})
    with pytest.raises(TypeError):
        parse_content_stream(plain_dictionary)

    # Declares itself a page, but /Contents is an integer rather than a stream.
    false_page = Dictionary(Type=Name.Page, Contents=42)
    with pytest.raises(PdfError):
        parse_content_stream(false_page)
コード例 #7
0
def test_unattached_page():
    """Reading ``index`` or ``label`` of a page outside any document fails."""
    detached = Page(
        Dictionary(Type=Name.Page, MediaBox=[0, 0, 612, 792], Resources=Dictionary())
    )

    for attribute in ('index', 'label'):
        with pytest.raises(ValueError, match='not attached'):
            getattr(detached, attribute)
コード例 #8
0
def rewrite_png(pike: Pdf, im_obj: Object,
                compdata) -> None:  # pragma: no cover
    """Rewrite ``im_obj`` as a Flate-encoded image taken from PNG data.

    Arguments:
        pike: PDF in which the palette stream, if any, is created.
        im_obj: image object that receives the new size, bit depth, color
            space and stream data.
        compdata: compressed PNG data; supplies ``predictor``, ``bps``,
            ``spp``, ``w``, ``h``, ``ncolors``, ``get_palette_pdf_string()``
            and ``read()``.
    """
    # When a PNG is inserted into a PDF, the IDAT section is more or less
    # copied and the rest of the PNG headers become PDF image metadata. The
    # PDF reader must be told whether a predictor was applied before Flate
    # encoding (typically one is). According to the Leptonica source, readers
    # do not need the exact predictor, only one of:
    #   1 - no predictor
    #   10-14 - there is a predictor
    # Leptonica's compdata->predictor is just TRUE or FALSE. A value >= 10
    # means the actual predictor is specified in the data, so the reader uses
    # whatever the PNG data says. In practice Leptonica should use Paeth (14),
    # but 15 seems to be the designated value for "optimal", so 15 is used.
    # See:
    #   - PDF RM 7.4.4.4 Table 10
    #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
    uses_predictor = compdata.predictor > 0
    dparms = Dictionary(Predictor=15 if uses_predictor else 1)
    if uses_predictor:
        dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
        dparms.Colors = compdata.spp
        dparms.Columns = compdata.w

    im_obj.BitsPerComponent = compdata.bps
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    log.debug(
        f"PNG {im_obj.objgen}: palette={compdata.ncolors} spp={compdata.spp} bps={compdata.bps}"
    )

    if compdata.ncolors > 0:
        # .ncolors counts the palette entries, not the colors used in a true
        # color image. The palette string is always given as RGB tuples, even
        # for a grayscale image; see
        # https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
        palette_data = pikepdf.Object.parse(compdata.get_palette_pdf_string())
        palette_stream = pikepdf.Stream(pike, bytes(palette_data))
        colorspace = [
            Name.Indexed, Name.DeviceRGB, compdata.ncolors - 1, palette_stream
        ]
    elif compdata.spp == 1:
        # From here on ncolors == 0: a color space without a palette.
        colorspace = Name.DeviceGray
    elif compdata.spp == 4:
        colorspace = Name.DeviceCMYK
    else:  # spp == 3
        colorspace = Name.DeviceRGB

    im_obj.ColorSpace = colorspace
    im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
コード例 #9
0
    def from_dictionary_object(cls, obj: Dictionary):
        """Build an ``OutlineItem`` from one outline node of a PDF document.

        Only the node itself is read; nested items are not processed.

        Arguments:
            obj: ``Dictionary`` object representing a single outline node.
        """
        return cls(
            str(obj.Title),
            destination=obj.get(Name.Dest),
            action=obj.get(Name.A),
            obj=obj,
        )
コード例 #10
0
    def _save_level_outline(
        self,
        parent: Dictionary,
        outline_items: Iterable[OutlineItem],
        level: int,
        visited_objs: Set[Tuple[int, int]],
    ) -> None:
        """Write one level of outline items below ``parent``, then recurse.

        Each item is converted with ``to_dictionary_object`` and linked to its
        siblings through ``Prev``/``Next`` and to ``parent`` through
        ``Parent``. Afterwards ``First``, ``Last`` and ``Count`` of ``parent``
        are updated to describe this level.

        Arguments:
            parent: dictionary that receives ``First``, ``Last`` and ``Count``.
            outline_items: items to store directly below ``parent``.
            level: depth of this level; children are only descended into
                while ``level < self._max_depth``.
            visited_objs: ``objgen`` values already written during this save;
                updated in place.

        Raises:
            OutlineStructureError: if an object occurs a second time and
                ``self._strict`` is set.
        """
        count = 0
        prev: Optional[Dictionary] = None
        first: Optional[Dictionary] = None
        for item in outline_items:
            out_obj = item.to_dictionary_object(self._pdf)
            objgen = out_obj.objgen
            if objgen in visited_objs:
                # The same object was already written elsewhere in the tree.
                if self._strict:
                    raise OutlineStructureError(
                        f"Outline object {objgen} reoccurred in structure")
                # Non-strict mode: give this item a fresh object instead.
                out_obj = item.to_dictionary_object(self._pdf, create_new=True)
            else:
                visited_objs.add(objgen)

            out_obj.Parent = parent
            count += 1
            if prev is not None:
                prev.Next = out_obj
                out_obj.Prev = prev
            else:
                first = out_obj
                # The first sibling must not keep a leftover Prev link.
                if Name.Prev in out_obj:
                    del out_obj.Prev
            prev = out_obj
            if level < self._max_depth:
                sub_items = item.children
            else:
                # Depth limit reached: recurse with no children, which clears
                # First/Last on out_obj and sets its Count to 0.
                sub_items = ()
            self._save_level_outline(out_obj, sub_items, level + 1,
                                     visited_objs)
            # The recursive call always sets out_obj.Count.
            if item.is_closed:
                # Closed items store a negated Count and do not add their
                # descendants to the parent's total.
                out_obj.Count = -cast(int, out_obj.Count)
            else:
                count += cast(int, out_obj.Count)
        if count:
            assert prev is not None and first is not None
            # The last sibling must not keep a leftover Next link.
            if Name.Next in prev:
                del prev.Next
            parent.First = first
            parent.Last = prev
        else:
            # No items at this level: drop any existing First/Last links.
            if Name.First in parent:
                del parent.First
            if Name.Last in parent:
                del parent.Last
        parent.Count = count
コード例 #11
0
ファイル: test_parsers.py プロジェクト: mara004/pikepdf
def test_invalid_stream_object():
    """parse_content_stream() reports a specific TypeError per invalid input.

    Every ``pytest.raises`` block contains only the call under test, so an
    exception raised while building the input cannot satisfy the expectation
    by accident.
    """
    with pytest.raises(TypeError, match="must be a pikepdf.Object"):
        parse_content_stream(42)

    plain_dictionary = Dictionary({"/Hi": 3})
    with pytest.raises(TypeError, match="called on page or stream"):
        parse_content_stream(plain_dictionary)

    # Declares itself a page, but /Contents is an integer rather than a stream.
    false_page = Dictionary(Type=Name.Page, Contents=42)
    with pytest.raises(
            TypeError,
            match="parse_content_stream called on non-stream Object"):
        parse_content_stream(false_page)
コード例 #12
0
def test_copy():
    """copy() of a Dictionary yields an equal but distinct object."""
    contents = {
        '/Boolean': True,
        '/Integer': 42,
        '/Real': Decimal('42.42'),
        '/String': String('hi'),
        '/Array': Array([1, 2, 3.14]),
        '/Dictionary': Dictionary({'/Color': 'Red'}),
    }
    original = Dictionary(contents)
    duplicate = copy(original)

    assert duplicate == original
    assert duplicate is not original
    assert duplicate['/Dictionary'] == original['/Dictionary']
コード例 #13
0
def test_update_info(graph, outdir):
    """A changed /Title survives saving, and docinfo needs an indirect object."""
    saved_path = outdir / 'out.pdf'
    new_title = '我敢打赌,你只是想看看这意味着什么'

    graph.docinfo['/Title'] = new_title
    graph.save(saved_path)

    reopened = Pdf.open(saved_path)
    assert reopened.docinfo['/Title'] == new_title
    assert graph.docinfo['/Author'] == reopened.docinfo['/Author']

    # Assigning a direct dictionary is rejected...
    with pytest.raises(ValueError):
        reopened.docinfo = Dictionary({'/Keywords': 'bob'})

    # ...while an indirect one is accepted.
    indirect_info = graph.make_indirect(Dictionary({'/Keywords': 'bob'}))
    reopened.docinfo = indirect_info
    assert reopened.docinfo.is_indirect, "/Info must be an indirect object"
コード例 #14
0
 def test_ccitt(self, stream_object):
     """write() accepts CCITTFaxDecode together with its decode_parms."""
     # Not valid CCITT data; this only exercises the decode_parms handling.
     placeholder = b'\x00'
     parms = Dictionary(K=-1, Columns=8, Length=1)
     stream_object.write(
         placeholder, filter=Name.CCITTFaxDecode, decode_parms=parms
     )
コード例 #15
0
    def to_dictionary_object(self,
                             pdf: Pdf,
                             create_new: bool = False) -> Dictionary:
        """Store this outline node's data in a ``Dictionary`` object.

        The node's existing object is updated in place unless there is none
        or ``create_new`` is set; in those cases a new indirect object is made
        in ``pdf`` and remembered as ``self.obj``. An integer destination is
        resolved to a page destination on ``pdf`` and kept on the node.

        Arguments:
            pdf: PDF document object.
            create_new: If set to ``True``, creates a new object instead of
                modifying an existing one in-place.
        """
        node = None if create_new else self.obj
        if node is None:
            node = pdf.make_indirect(Dictionary())
            self.obj = node

        node.Title = self.title

        dest = self.destination
        if dest is not None:
            if isinstance(dest, int):
                dest = make_page_destination(
                    pdf,
                    dest,
                    self.page_location,
                    **self.page_location_kwargs,
                )
                self.destination = dest
            node.Dest = dest
            # A destination replaces any action stored on the node.
            if Name.A in node:
                del node.A
        elif self.action is not None:
            node.A = self.action
            # An action replaces any destination stored on the node.
            if Name.Dest in node:
                del node.Dest
        return node
コード例 #16
0
def set_pagelabels(doc, page_labels):
    """Install page label rules on ``doc`` as its ``/PageLabels`` entry.

    Arguments:
        doc: document whose ``root`` dictionary receives ``/PageLabels``.
        page_labels: iterable of mappings. Each needs a 1-based ``'start'``
            page number and may carry ``'style'`` (a numbering style name
            without the leading slash, or ``'none'`` for no style),
            ``'prefix'`` and ``'initial_count'``.

    The rules are written in ascending order of their start page, whatever
    order they arrive in, because the keys of a number tree's ``/Nums`` array
    have to be sorted.
    """
    nums = []
    for label in sorted(page_labels, key=lambda rule: rule['start']):
        page_index = label['start'] - 1  # page index 1-based -> 0-based
        rule = {}
        if 'style' in label and label['style'] != 'none':
            rule['/S'] = Name('/' + label['style'])
        if 'prefix' in label:
            rule['/P'] = label['prefix']
        if 'initial_count' in label:
            rule['/St'] = label['initial_count']
        nums.append(page_index)
        nums.append(Dictionary(rule))
    doc.root[Name.PageLabels] = Dictionary({'/Nums': Array(nums)})
コード例 #17
0
ファイル: test_metadata.py プロジェクト: yeus/pikepdf
def test_docinfo_delete_missing(sandwich):
    """load_from_docinfo(delete_missing=True) drops keys absent from docinfo."""
    creator = 'test creator'
    with sandwich.open_metadata() as meta:
        docinfo = Dictionary(Creator="test creator")

        # Before loading: the date is present and the tool is something else.
        assert 'xmp:CreateDate' in meta
        assert meta['xmp:CreatorTool'] != creator

        meta.load_from_docinfo(docinfo, delete_missing=True)

        # After loading: the tool is taken over and the date is gone.
        assert meta['xmp:CreatorTool'] == creator
        assert 'xmp:CreateDate' not in meta
コード例 #18
0
 def _save(self):
     """Write the outline back below the document's ``/Outlines`` entry.

     Does nothing when ``self._root`` is None. An ``/Outlines`` dictionary is
     created in the document root when it has none yet.
     """
     if self._root is None:
         return

     catalog = self._pdf.Root
     if Name.Outlines not in catalog:
         outlines = self._pdf.make_indirect(Dictionary(Type=Name.Outlines))
         catalog.Outlines = outlines
     else:
         outlines = catalog.Outlines

     self._save_level_outline(outlines, self._root, 0, set())
コード例 #19
0
def test_with_same_owner_as(vera, outlines, outpdf):
    """Exercise with_same_owner_as() in each ownership situation."""
    vera_root = vera.Root
    assert vera_root.is_owned_by(vera)

    # Case: return reference to self.
    vera_root.IndirectDict = vera.make_indirect(Dictionary(Foo=42))
    vera.save(outpdf)

    # Case: copy a direct object.
    direct = Dictionary(Foo=42)
    vera_root.CopiedDirectNames = direct.with_same_owner_as(vera_root)
    vera.save(outpdf)

    # Case: copy an object that belongs to another document.
    foreign_names = outlines.Root.Names
    vera_root.ForeignNames = foreign_names.with_same_owner_as(vera_root)
    vera.save(outpdf)

    # Case: the other object is not a valid owner reference.
    with pytest.raises(ValueError):
        outlines.Root.Names.with_same_owner_as(Dictionary(Foo=42))
コード例 #20
0
ファイル: test_page.py プロジェクト: mara004/pikepdf
def test_failed_add_page_cleanup():
    """A rejected ``pages.append`` leaves no stale object and harms nothing."""
    rejection = "only pages can be inserted"

    pdf = Pdf.new()
    direct_non_page = Dictionary(Type=Name.NotAPage)
    objects_before = len(pdf.objects)
    with pytest.raises(TypeError, match=rejection):
        pdf.pages.append(direct_non_page)
    assert len(pdf.pages) == 0

    # A failed insertion is expected to leave exactly one new null object
    # handle behind, since QPDF does not remove the object outright.
    assert len(pdf.objects) == objects_before + 1, "QPDF semantics changed"
    assert pdf.objects[-1] is None, "Left a stale object behind without deleting"

    # An object that already existed in the file must not be deleted, though.
    indirect_non_page = pdf.make_indirect(Dictionary(Type=Name.StillNotAPage))
    with pytest.raises(TypeError, match=rejection):
        pdf.pages.append(indirect_non_page)
    assert len(pdf.pages) == 0

    assert indirect_non_page.same_owner_as(pdf.Root)
コード例 #21
0
def test_json():
    """to_json() output parses back into the equivalent plain Python data."""
    source = Dictionary({
        '/Boolean': True,
        '/Integer': 42,
        '/Real': Decimal('42.42'),
        '/String': String('hi'),
        '/Array': Array([1, 2, 3.14]),
        '/Dictionary': Dictionary({'/Color': 'Red'}),
    })
    expected = {
        "/Array": [1, 2, 3.14],
        "/Boolean": True,
        "/Dictionary": {
            "/Color": "Red"
        },
        "/Integer": 42,
        "/Real": 42.42,
        "/String": "hi",
    }

    parsed = json.loads(source.to_json(False))
    assert parsed == expected
コード例 #22
0
ファイル: test_page.py プロジェクト: knobix/pikepdf
class TestAddResource:
    """Tests for ``add_resource`` on the ``graph_page`` fixture."""

    # Small image XObject dictionary shared by all tests in this class.
    d = Dictionary(Type=Name.XObject, Subtype=Name.Image, Width=1, Height=1)

    def test_basic(self, graph_page):
        """An existing name is only overwritten when replace_existing is set."""
        image = self.d
        xobject = Name.XObject

        with pytest.raises(ValueError, match="already exists"):
            graph_page.add_resource(
                image, xobject, Name.Im0, replace_existing=False
            )

        replaced = graph_page.add_resource(
            image, xobject, Name.Im0, replace_existing=True
        )
        assert graph_page.resources.XObject[replaced].Width == 1

        # With only a prefix given, the returned name starts with it.
        generated = graph_page.add_resource(image, xobject, prefix='Im')
        assert str(generated).startswith("/Im")
        assert graph_page.resources.XObject[generated].Height == 1

    def test_resources_exists_but_wrong_type(self, graph_page):
        """A /Resources entry that is not a dictionary raises TypeError."""
        del graph_page.obj.Resources
        graph_page.obj.Resources = Name.Dummy
        with pytest.raises(TypeError, match='exists but is not a dictionary'):
            graph_page.add_resource(
                self.d, Name.XObject, Name.Im0, replace_existing=False
            )

    def test_create_resource_dict_if_not_exists(self, graph_page):
        """A missing /Resources entry is created when a resource is added."""
        del graph_page.obj.Resources
        graph_page.add_resource(
            self.d, Name.XObject, Name.Im0, replace_existing=False
        )
        assert Name.Resources in graph_page.obj

    def test_name_and_prefix(self, graph_page):
        """Passing both ``name`` and ``prefix`` raises ValueError."""
        with pytest.raises(ValueError, match="one of"):
            graph_page.add_resource(
                self.d, Name.XObject, name=Name.X, prefix='y'
            )

    def test_unrecognized_object_not_disturbed(self, graph_page):
        """Unrelated entries in /Resources remain after adding a resource."""
        graph_page.obj.Resources.InvalidItem = Array([42])
        graph_page.add_resource(self.d, Name.Pattern)
        assert Name.InvalidItem in graph_page.obj.Resources
コード例 #23
0
ファイル: test_object.py プロジェクト: merll/pikepdf
def test_json():
    """to_json() output parses back into the equivalent plain Python data."""
    source = Dictionary({
        '/Boolean': True,
        '/Integer': 42,
        '/Real': Decimal('42.42'),
        '/String': String('hi'),
        '/Array': Array([1, 2, 3.14]),
        '/Dictionary': Dictionary({'/Color': 'Red'}),
    })
    expected = {
        "/Array": [1, 2, 3.140000],
        "/Boolean": True,
        "/Dictionary": {
            "/Color": "Red"
        },
        "/Integer": 42,
        "/Real": 42.42,
        "/String": "hi",
    }

    encoded = source.to_json(False)
    try:
        parsed = json.loads(encoded)
    except TypeError:
        # Py3.5 shim: decode the bytes before parsing.
        parsed = json.loads(encoded.decode('utf-8'))
    assert parsed == expected
コード例 #24
0
def test_extract_direct_fails_nondefault_colortransform(congress):
    """_extract_direct() refuses an image with an unusual ColorTransform."""
    image_obj, _pdf = congress
    sink = BytesIO()

    # 42 is a non-standard ColorTransform value.
    image_obj.DecodeParms = Dictionary(ColorTransform=42)
    pdf_image = PdfImage(image_obj)
    with pytest.raises(UnsupportedImageTypeError):
        pdf_image._extract_direct(stream=sink)

    # The same is expected once the color space is switched to CMYK.
    image_obj.ColorSpace = Name.DeviceCMYK
    pdf_image = PdfImage(image_obj)
    with pytest.raises(UnsupportedImageTypeError):
        pdf_image._extract_direct(stream=sink)
コード例 #25
0
ファイル: test_nametree.py プロジェクト: mara004/pikepdf
def test_nametree_crud(outline):
    """Look up, delete, re-add and add entries through a NameTree."""
    dests = outline.Root.Names.Dests
    tree = NameTree(dests)
    assert tree.obj == dests

    # Lookup
    assert '0' in tree
    assert isinstance(tree['0'], Object)
    assert 'foo' not in tree

    # Delete
    assert '3' in tree
    del tree['3']
    assert '3' not in tree

    # Re-create the deleted key
    tree['3'] = Dictionary(Entry=3)
    assert tree['3'].Entry == 3

    # New key holding an Array
    new_value = Array([42])
    tree['newentry'] = new_value
    assert tree['newentry'] == Array([42])

    # New key holding a plain Python value
    tree['py_newentry'] = 42
コード例 #26
0
def rewrite_png_as_g4(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    """Rewrite ``im_obj`` as a 1-bit CCITTFaxDecode image from ``compdata``.

    Arguments:
        pike: not used by this function.
        im_obj: image object that receives the new data and parameters.
        compdata: source of ``w``, ``h``, ``minisblack`` and the data
            returned by ``read()``.
    """
    im_obj.BitsPerComponent = 1
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    im_obj.write(compdata.read())

    log.debug(f"PNG to G4 {im_obj.objgen}")

    # Remove leftover parameters before installing the CCITT ones.
    if Name.Predictor in im_obj:
        del im_obj.Predictor
    if Name.DecodeParms in im_obj:
        del im_obj.DecodeParms

    ccitt_parms = Dictionary(
        K=-1, BlackIs1=bool(compdata.minisblack), Columns=compdata.w
    )
    im_obj.DecodeParms = ccitt_parms
    im_obj.Filter = Name.CCITTFaxDecode
コード例 #27
0
def convert_to_jbig2(
    pike: Pdf,
    jbig2_groups: Dict[int, List[XrefExt]],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Convert images to JBIG2 and insert them into the PDF.

    With a JBIG2 page group size above 1, several images are encoded together
    and share one symbol dictionary spanning several pages; every image of the
    group must reference that dictionary. Sharing one dictionary among too
    many pages makes JBIG2 encoding more expensive and less efficient; the
    default of 10 was found by testing. This mode is currently always lossy,
    because jbig2enc does not support refinement coding.

    Without the symbolic coder each JBIG2 image stands alone and needs no
    dictionary. This mode is currently always lossless.

    Raises:
        FileNotFoundError: if a group has no symbol file although
            ``options.jbig2_page_group_size`` is not 1.
    """
    _produce_jbig2_images(jbig2_groups, root, options, executor)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        symbol_path = root / (prefix + '.sym')

        decode_parms: Optional[Dictionary]
        if symbol_path.exists():
            # Shared symbol dictionary for every image of this group.
            symbol_stream = Stream(pike, symbol_path.read_bytes())
            decode_parms = Dictionary(JBIG2Globals=symbol_stream)
        elif options.jbig2_page_group_size == 1:
            decode_parms = None
        else:
            raise FileNotFoundError(symbol_path)

        for n, (xref, _) in enumerate(xref_exts):
            image_path = root / (prefix + f'.{n:04d}')
            image_data = image_path.read_bytes()
            target = pike.get_object(xref, 0)
            target.write(
                image_data, filter=Name.JBIG2Decode, decode_parms=decode_parms
            )
コード例 #28
0
def test_dict_or_array_dict():
    """DecodeParms given as an array holding one dictionary is unpacked."""
    pdf = pikepdf.new()
    ccitt_parms = Dictionary(
        BlackIs1=False,
        Columns=16,
        K=-1,
    )
    image_stream = Stream(
        pdf,
        b'dummy',
        BitsPerComponent=1,
        ColorSpace=Name.DeviceGray,
        DecodeParms=Array([ccitt_parms]),
        Filter=Array([Name.CCITTFaxDecode]),
        Height=16,
        Width=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    pdf_image = pikepdf.PdfImage(image_stream)
    # The array of dictionaries must be unpacked properly.
    first_parms = pdf_image.decode_parms[0]
    assert first_parms.K == -1
コード例 #29
0
ファイル: test_page.py プロジェクト: knobix/pikepdf
def test_fourpages_to_4up(fourpages, graph, outpdf):
    """Lay four pages out on one 1000x1000 sheet with overlays and underlays.

    The sources are passed as a raw page, as a ``Page`` and as a form XObject;
    a plain dictionary must be rejected with TypeError.
    """
    pdf = Pdf.new()
    pdf.add_blank_page(page_size=(1000, 1000))
    sheet = Page(pdf.pages[0])

    pdf.pages.extend(fourpages.pages)

    top_left = Rectangle(0, 500, 500, 1000)
    top_right = Rectangle(500, 500, 1000, 1000)
    bottom_left = Rectangle(0, 0, 500, 500)
    bottom_right = Rectangle(500, 0, 1000, 500)

    sheet.add_overlay(pdf.pages[1], top_left)
    sheet.add_overlay(Page(pdf.pages[2]), top_right)
    sheet.add_overlay(Page(pdf.pages[3]).as_form_xobject(), bottom_left)
    sheet.add_underlay(pdf.pages[4], bottom_right)

    # No rectangle given for this underlay.
    sheet.add_underlay(graph.pages[0])

    with pytest.raises(TypeError):
        sheet.add_overlay(Dictionary(Key=123))

    # Keep only the composed sheet.
    del pdf.pages[1:]

    pdf.save(outpdf)
コード例 #30
0
ファイル: test_outlines.py プロジェクト: yeus/pikepdf
def test_dest_or_action(outlines_doc):
    """Setting a destination removes the action, and the other way round."""
    first_node = outlines_doc.Root.Outlines.First
    first_page = outlines_doc.pages[0]
    expected_dest = [first_page, Name.Fit]

    assert '/A' in first_node
    assert '/Dest' not in first_node

    with outlines_doc.open_outline() as outline:
        item = outline.root[0]
        # Point the item at the first page by index.
        item.destination = 0

    # After saving, the index has been replaced by a page reference.
    assert item.destination == expected_dest
    assert first_node.Dest == item.destination
    # The original action is gone.
    assert '/A' not in first_node

    # Save again, this time with an action instead of a destination.
    with outlines_doc.open_outline() as outline:
        item = outline.root[0]
        item.action = Dictionary(D=item.destination, S=Name.GoTo)
        item.destination = None

    assert first_node.A.D == expected_dest
    assert '/Dest' not in first_node
コード例 #31
0
ファイル: optimize.py プロジェクト: jbarlow83/OCRmyPDF
def transcode_pngs(pike, images, image_name_fn, root, log, options):
    """Replace PDF images with their (optionally quantized) PNG encodings.

    Arguments:
        pike: PDF being modified; image objects are fetched with
            ``pike.get_object(xref, 0)``.
        images: xrefs of the images to transcode.
        image_name_fn: callable ``(root, xref)`` returning the file that is
            handed to pngquant.
        root: working location passed on to the file-name helpers.
        log: logger used for debug and error messages.
        options: must provide ``optimize``; when ``optimize >= 2`` also
            ``png_quality`` and ``jobs``.

    An image is left untouched when its PNG cannot be opened, when the PNG is
    larger than the stream it would replace, or when the PNG's samples per
    pixel do not map to a PDF color space.
    """
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=options.jobs
        ) as executor:
            for xref in images:
                source_name = image_name_fn(root, xref)
                log.debug(source_name)
                # The future is not inspected here; a failed quantization
                # shows up below as a PNG that cannot be opened.
                executor.submit(
                    pngquant.quantize,
                    source_name,
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )

    for xref in images:
        im_obj = pike.get_object(xref, 0)
        try:
            compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If re-coded image is larger don't use it - we test here because
        # pngquant knows the size of the temporary output file but not the actual
        # object in the PDF
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}"
            )
            continue

        # Work out the color space before im_obj is modified, so that an
        # unsupported PNG leaves the original image fully intact.
        if compdata.ncolors > 0:
            # .ncolors is the number of colors in the palette, not the number of
            # colors used in a true color image
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            cs = [
                Name.Indexed,
                Name.DeviceRGB,
                compdata.ncolors - 1,
                palette_stream,
            ]
        elif compdata.spp == 1:
            # PDF interprets binary-1 as black in 1bpp, but PNG sets
            # black to 0 for 1bpp. Creating a palette that informs the PDF
            # of the mapping, [Name.Indexed, Name.DeviceGray, 1, b"\xff\x00"],
            # seems cleaner, but pikepdf needs to be patched to support it.
            cs = Name.DeviceGray
        elif compdata.spp == 3:
            cs = Name.DeviceRGB
        elif compdata.spp == 4:
            cs = Name.DeviceCMYK
        else:
            log.error(
                f"pngquant: unsupported samples per pixel {compdata.spp}, "
                f"keeping original image"
            )
            continue

        # When a PNG is inserted into a PDF, we more or less copy the IDAT section
        # and transfer the rest of the PNG headers to PDF image metadata.
        # One thing we have to do is tell the PDF reader whether a predictor was used
        # on the image before Flate encoding. (Typically one is.)
        # According to Leptonica source, PDF readers don't actually need us
        # to specify the correct predictor, they just need a value of either:
        #   1 - no predictor
        #   10-14 - there is a predictor
        # Leptonica's compdata->predictor only tells TRUE or FALSE
        # From there the PNG decoder can infer the rest from the file.
        # In practice the predictor should be Paeth, 14, so we'll use that.
        # See:
        #   - PDF RM 7.4.4.4 Table 10
        #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
        predictor = 14 if compdata.predictor > 0 else 1
        dparms = Dictionary(Predictor=predictor)
        if predictor > 1:
            dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
            dparms.Colors = compdata.spp
            dparms.Columns = compdata.w

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h

        if compdata.bps == 1:
            im_obj.Decode = [1, 0]  # Bit of a kludge but this inverts photometric too
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)