Exemple #1
0
def create_blank_page(_pdf=None, width=None, height=None):
    """
    Build and return a new, empty page object.

    When either width or height is None, BOTH dimensions are taken from the
    media box of the last page of _pdf.  If _pdf is None or has no pages in
    that situation, PageSizeNotDefinedError is raised.

    _pdf -- PDF file the page belongs to
    width -- The width of the new page expressed in default user space units.
    height -- The height of the new page expressed in default user space units.
    """
    page = PageObject(_pdf)

    # Minimal page dictionary (cf PDF Reference 7.7.3.3)
    page[NameObject(_k.TYPE)] = NameObject(_k.PAGE)
    page[NameObject(b'/Parent')] = NullObject()
    page[NameObject(_k.RESOURCES)] = DictObject()

    size_missing = width is None or height is None
    if size_missing:
        if _pdf is None or _pdf.get_pages_count() <= 0:
            raise utils.PageSizeNotDefinedError()
        # Borrow the dimensions of the document's last page.
        reference_page = _pdf.get_page(_pdf.get_pages_count() - 1)
        width = reference_page.media_box.get_width()
        height = reference_page.media_box.get_height()

    media_box = RectangleObject([0, 0, width, height])
    page[NameObject(_k.MEDIA_BOX)] = media_box
    return page
Exemple #2
0
    def _chop_images(self):
        """
        Trim the empty edges of the image referenced by every flattened page.

        For each page the content stream is split on whitespace, the image
        XObject named by those tokens is looked up, and, when it is an
        /Image, it is replaced by the result of
        _im.chop_off_image_empty_edges.  The page content stream and the
        media box width/height entries are then rewritten to the new size.
        Non-image XObjects are only logged.
        """
        pages_count = len(self._flattened_pages)
        for page_no, page in enumerate(self._flattened_pages, start=1):
            bytes_data: bytes = page[_k.CONTENT].get_data()
            parts = _pattern_space.split(bytes_data)
            xobjects = page[_k.RESOURCES][b'/XObject']
            the_image = xobjects[_get_image_name_from(parts)]
            _u.debug(parts, len(page[_k.RESOURCES][b'/XObject']), the_image)
            image_data = the_image.get_data()

            is_image = b'/Subtype' in the_image and the_image[b'/Subtype'] == b'/Image'
            if not is_image:
                _u.debug(image_data)
                continue

            chopped = _im.chop_off_image_empty_edges(the_image, image_data, page_no)
            width, height, compressed_length, compressed_data = chopped

            # Update the image stream dictionary and payload in place.
            the_image[NameObject(b'/Length')] = NumberObject(compressed_length)
            the_image[NameObject(b'/Width')] = NumberObject(width)
            the_image[NameObject(b'/Height')] = NumberObject(height)
            the_image._bytes_data = compressed_data

            # We might need to insert this matrix in the below line: 1 0 0 1 0 100 cm
            new_content = b''.join((
                b'q ', _u.s2b(str(width)), b' 0 0 ', _u.s2b(str(height)),
                b' 0 0 cm ', parts[-5], b' Do Q',
            ))
            page[_k.CONTENT].set_data(new_content)
            page[_k.MEDIA_BOX][2] = NumberObject(width)
            page[_k.MEDIA_BOX][3] = NumberObject(height)
            _u.debug(
                'Chopped empty edges for {:4}/{} image.'.format(page_no, pages_count),
                page[_k.MEDIA_BOX], page[_k.MEDIA_BOX][2:], parts, width, height,
                page[_k.CONTENT].get_data(),
            )
Exemple #3
0
    def merge_page(self, page2, page2transformation=None):
        """Merge the content stream of page2 into this page.

        Resources (fonts, XObjects, ...) of both pages are kept; entries of
        page2 whose names clash with different entries of this page are
        renamed, and page2's content stream is rewritten accordingly.  The
        boxes of this page are not touched.  page2's content is appended
        after this page's content, so it is drawn "on top".

        page2 - The PageObject to be merged into this one.
        page2transformation - Optional function applied to page2's content
                              stream before merging.  Takes: page2 contents
                              stream. Must return: new contents stream. If
                              omitted, the content stream is not modified."""
        # Merge the resource dictionaries first: this tells us which symbols
        # in page2's content stream have to be renamed.
        rename = {}
        new_resources = DictObject()
        original_resources = self[_k.RESOURCES].get_object()
        page2_resources = page2[_k.RESOURCES].get_object()

        resource_kinds = (
            b'/ExtGState', b'/Font', b'/XObject', b'/ColorSpace',
            b'/Pattern', b'/Shading', b'/Properties',
        )
        for res in resource_kinds:
            merged, renamed = _merge_resources(
                original_resources, page2_resources, res)
            if merged:
                new_resources[NameObject(res)] = merged
                rename.update(renamed)

        # /ProcSet is the set union of both pages' procedure sets.
        own_proc_set = frozenset(
            original_resources.get(b'/ProcSet', ArrayObject()).get_object())
        other_proc_set = frozenset(
            page2_resources.get(b'/ProcSet', ArrayObject()).get_object())
        new_resources[NameObject(b'/ProcSet')] = ArrayObject(
            own_proc_set.union(other_proc_set))

        new_content_array = ArrayObject()

        original_content = self.get_contents()
        if original_content is not None:
            isolated = _push_pop_graphics_state(original_content, self.parent)
            new_content_array.append(isolated)

        page2_content = page2.get_contents()
        if page2_content is not None:
            if page2transformation is not None:
                page2_content = page2transformation(page2_content)
            page2_content = _content_stream_rename(
                page2_content, rename, self.parent)
            page2_content = _push_pop_graphics_state(
                page2_content, self.parent)
            new_content_array.append(page2_content)

        self[NameObject(_k.CONTENT)] = _ContentStreamObject(
            new_content_array, self.parent)
        self[NameObject(_k.RESOURCES)] = new_resources
Exemple #4
0
    def __init__(self):
        """
        Set up an empty document: no file id, no encryption, and four
        registered objects (page tree root, info dictionary, outlines
        dictionary and catalog), added in that order via _add_object.
        """
        self._objects = []  # array of indirect objects
        self._id = None
        self._encrypt = None
        self._encrypt_key = None

        # Root node of the page tree, initially without kids.
        page_tree = DictObject()
        page_tree.update({
            NameObject(_k.TYPE): NameObject(_k.PAGES),
            NameObject(_k.COUNT): NumberObject(0),
            NameObject(_k.KIDS): ArrayObject(),
        })
        self._pages = self._add_object(page_tree)

        # Document information dictionary.
        doc_info = DictObject()
        doc_info.update({
            NameObject(b'/Producer'): create_string_object(b'PyPDF - Refactored by QXF')
        })
        self._info = self._add_object(doc_info)

        # Outlines start as an empty dictionary; the catalog links to both
        # the page tree and the outlines.
        self.__outlines = self._add_object(DictObject())
        catalog = DictObject()
        catalog.update({
            NameObject(_k.TYPE): NameObject(b'/Catalog'),
            NameObject(_k.PAGES): self._pages,
            NameObject(_k.OUTLINES): self.__outlines,
        })
        self._root = self._add_object(catalog)
Exemple #5
0
 def _write_trailer_to(self, stream):
     """
     Write the ``trailer`` keyword line and the trailer dictionary to stream.

     The dictionary always carries the size (number of objects plus one),
     root and info entries; the id and encrypt entries are added only when
     the corresponding attribute is not None.
     """
     stream.write(b'trailer\n')

     required_entries = {
         NameObject(_k.SIZE): NumberObject(len(self._objects) + 1),
         NameObject(_k.ROOT): self._root,
         NameObject(_k.INFO): self._info,
     }
     trailer = DictObject()
     trailer.update(required_entries)

     # Optional entries, in the same order as they were originally emitted.
     for key, value in ((_k.ID, self._id), (_k.ENCRYPT, self._encrypt)):
         if value is not None:
             trailer[NameObject(key)] = value

     trailer.write_to_stream(stream)
Exemple #6
0
    def _add_page(self, page, callback_add):
        """
        Shared implementation for appending or inserting a page.

        The page gets this document's page tree as its parent, is registered
        as a new object, handed to callback_add together with the kids
        list, and the page count is incremented by one.

        page - The page to add to the document.  Its type entry must equal
               the page type key (checked with an assert).
        callback_add - The function which will insert the page in the
                       kids list.  Takes: page list, page to add.
        """
        assert page[_k.TYPE] == _k.PAGE
        page[NameObject(b'/Parent')] = self._pages
        page_ref = self._add_object(page)

        page_tree = self._pages.get_object()
        callback_add(page_tree[_k.KIDS], page_ref)

        incremented = page_tree[_k.COUNT] + 1
        page_tree[NameObject(_k.COUNT)] = NumberObject(incremented)
Exemple #7
0
 def add_transformation(self, ctm):
     """
     Apply the transformation matrix ctm to this page's content.

     The existing content is passed through _add_transformation_matrix and
     then wrapped by _push_pop_graphics_state before being stored back.
     Nothing happens when the page has no content.

     ctm - The transformation matrix handed to _add_transformation_matrix.
     """
     current = self.get_contents()
     if current is None:
         return
     transformed = _add_transformation_matrix(current, self.parent, ctm)
     wrapped = _push_pop_graphics_state(transformed, self.parent)
     self[NameObject(_k.CONTENT)] = wrapped
Exemple #8
0
 def __parse_xref_stream(self):
     """
     Parse one cross-reference stream (PDF 1.5+) at the current position
     of self._stream and record its entries.

     Type 1 entries are stored in self._xref[generation][num] and type 2
     entries in self._xref_obj_stream[num]; in both cases an entry that is
     already present is left untouched.  Type 0 entries and unknown types
     are only logged.  The root/encrypt/info/id keys of the stream
     dictionary are copied into self._trailer when not already there.

     Returns the stream dictionary's /Prev value, or None when absent.
     Raises AssertionError when the object read is not of type /XRef.
     """
     # PDF 1.5+ Cross-Reference Stream
     # Step back one byte before reading the object header.
     # NOTE(review): presumably the caller already consumed the first byte
     # of the object number -- confirm against the call site.
     self._stream.seek(-1, io.SEEK_CUR)
     idnum, generation = _read_object_header(self._stream)
     xrefstream = read_object(self._stream, self)
     assert xrefstream[_k.TYPE] == b'/XRef'
     self._cache_indirect_object(generation, idnum, xrefstream)
     # _u.debug(xrefstream)
     stream_data = BytesIO(xrefstream.get_data())
     # /Index defaults to a single subsection covering [0, Size).
     idx_pairs = xrefstream.get(b'/Index', [0, xrefstream.get(_k.SIZE)])
     # /W holds the byte width of each field of an entry.
     entry_sizes = xrefstream.get(b'/W')
     for num, size in _generate_pairs(idx_pairs):
         # num: first object number of the subsection; size: entry count.
         cnt = 0
         xref_type = None
         byte_offset = None
         objstr_num = None
         obstr_idx = None
         while cnt < size:
             # Read one entry, field by field.  Values that are not
             # overwritten by this entry keep whatever the previous entry
             # (or, for generation, the object header) left in them.
             for i in range(len(entry_sizes)):
                 d = stream_data.read(entry_sizes[i])
                 di = _convert_to_int(d, entry_sizes[i])
                 if i == 0:
                     # Field 0 selects the entry type.
                     xref_type = di
                 elif i == 1:
                     # Field 1: meaning depends on the entry type.
                     if xref_type == 0:
                         # Assigned but never read in this method.
                         _next_free_object = di
                     elif xref_type == 1:
                         byte_offset = di
                     elif xref_type == 2:
                         objstr_num = di
                 elif i == 2:
                     # Field 2: meaning depends on the entry type.
                     if xref_type == 0:
                         # Assigned but never read in this method.
                         _next_generation = di
                     elif xref_type == 1:
                         generation = di
                     elif xref_type == 2:
                         obstr_idx = di
             if xref_type == 0:
                 _u.debug('Ignored xref_type == 0, do not know why.')
             elif xref_type == 1:
                 # Keep the first offset recorded for (generation, num).
                 if generation not in self._xref:
                     self._xref[generation] = {}
                 if num not in self._xref[generation]:
                     self._xref[generation][num] = byte_offset
             elif xref_type == 2:
                 # Object lives inside an object stream: remember the
                 # stream's object number and the index within it.
                 if num not in self._xref_obj_stream:
                     self._xref_obj_stream[num] = [objstr_num, obstr_idx]
             else:
                 _u.debug(f'Unknown xref_type {xref_type}. Ignored.')
             cnt += 1
             num += 1
     # The stream dictionary doubles as a trailer: copy the trailer keys
     # that have not been set yet, without resolving references (raw_get).
     trailer_keys = _k.ROOT, _k.ENCRYPT, _k.INFO, _k.ID
     for key in trailer_keys:
         if key in xrefstream and key not in self._trailer:
             self._trailer[NameObject(key)] = xrefstream.raw_get(key)
     if b'/Prev' in xrefstream:
         return xrefstream[b'/Prev']
     else:
         return None
Exemple #9
0
def _merge_resources(res1, res2, resource):
    """
    Merge the sub-dictionary named resource of res1 and res2.

    All entries of res1's sub-dictionary are kept.  An entry of res2 whose
    key is absent is copied as-is (raw, unresolved); one whose key exists
    with a different value is stored under key + b'renamed'; one whose key
    exists with an equal value is skipped.

    Returns a pair: the merged dictionary and a mapping from each clashing
    original key to its new name.
    """
    merged = DictObject()
    merged.update(res1.get(resource, DictObject()).get_object())
    incoming = res2.get(resource, DictObject()).get_object()
    renames = {}
    for key in incoming.keys():
        if key not in merged:
            merged[key] = incoming.raw_get(key)
        elif merged[key] != incoming[key]:
            # Same name, different object: keep both under distinct names.
            alias = NameObject(key + b'renamed')
            renames[key] = alias
            merged[alias] = incoming[key]
    return merged, renames
Exemple #10
0
    def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True):
        """Encrypt this PDF file with the PDF Standard encryption handler.

        Generates a new two-part file id, builds the encryption dictionary,
        registers it as an object and stores the resulting key in
        self._encrypt_key.  All permissions are granted (/P is -1).

        user_pwd - The "user password", which allows for opening and reading
                the PDF file with the restrictions provided.
        owner_pwd - The "owner password", which allows for opening the PDF
                files without any restrictions.  By default, the owner password is the
                same as the user password.
        use_128bit - Boolean argument as to whether to use 128bit
                encryption.  When false, 40bit encryption will be used.  By default, this
                flag is on."""
        if owner_pwd is None:
            owner_pwd = user_pwd
        # Key length in BYTES.  Floor division keeps it an int: true
        # division would yield 16.0 / 5.0 on Python 3, and a float is
        # neither a valid length for the key-derivation helpers nor a valid
        # integer /Length value.
        if use_128bit:
            v = 2
            rev = 3
            keylen = 128 // 8
        else:
            v = 1
            rev = 2
            keylen = 40 // 8
        # permit everything:
        p = -1
        o = ByteStringObject(_u.algorithm_33(owner_pwd, user_pwd, rev, keylen))
        # The file id is derived from the current time and a random number.
        id_1 = _md5(bytes(repr(time.time()), _u.ENCODING_UTF8)).digest()
        id_2 = _md5(bytes(repr(random.random()), _u.ENCODING_UTF8)).digest()
        self._id = ArrayObject((ByteStringObject(id_1), ByteStringObject(id_2)))
        if rev == 2:
            u, key = _u.algorithm_34(user_pwd, o, p, id_1)
        else:
            assert rev == 3
            u, key = _u.algorithm_35(user_pwd, rev, keylen, o, p, id_1, False)
        encrypt = DictObject()
        encrypt[NameObject(b'/Filter')] = NameObject(b'/Standard')
        encrypt[NameObject(b'/V')] = NumberObject(v)
        if v == 2:
            # /Length is expressed in bits.
            encrypt[NameObject(b'/Length')] = NumberObject(keylen * 8)
        encrypt[NameObject(b'/R')] = NumberObject(rev)
        encrypt[NameObject(b'/O')] = ByteStringObject(o)
        encrypt[NameObject(b'/U')] = ByteStringObject(u)
        encrypt[NameObject(b'/P')] = NumberObject(p)
        self._encrypt = self._add_object(encrypt)
        self._encrypt_key = key
Exemple #11
0
    def _build_outline(self, node):
        """
        Build an outline entry from an outline dictionary node.

        The destination is taken from a /GoTo action (/A) or from a direct
        destination entry; in both cases the node must also carry a title.
        An array destination is converted with _build_destination; a string
        destination is looked up in self._named_dests and the found entry
        gets the node's title.

        node - The outline item dictionary to inspect.

        Returns the outline entry, or None when the node yields no
        destination.  Raises PdfReadError for a destination that is neither
        an array nor a known named destination.
        """
        dest, title, outline = None, None, None

        if b'/A' in node and _k.TITLE in node:
            # Action, section 8.5 (only type GoTo supported)
            title = node[_k.TITLE]
            action = node[b'/A']
            if action[b'/S'] == b'/GoTo':
                dest = action[b'/D']
        elif _k.DEST in node and _k.TITLE in node:
            # DestObject, section 8.2.1
            title = node[_k.TITLE]
            dest = node[_k.DEST]

        # if destination found, then create outline
        if dest:
            if isinstance(dest, ArrayObject):
                outline = _build_destination(title, dest)
            # The type check must come first and the membership test must
            # target the named-destination mapping; the previous form
            # evaluated ``dest in <bool>`` and raised TypeError.
            elif isinstance(dest, str) and dest in self._named_dests:
                outline = self._named_dests[dest]
                outline[NameObject(_k.TITLE)] = title
            else:
                raise _u.PdfReadError("Unexpected destination %r" % dest)
        return outline
Exemple #12
0
 def get_pages_count(self):
     """Return, as an int, the count entry of the object self._pages refers to."""
     page_tree = self._pages.get_object()
     count_key = NameObject(_k.COUNT)
     return int(page_tree[count_key])
Exemple #13
0
import PyPDF.compound as _c
import PyPDF.image_tools as _im
import PyPDF.keys as _k
import PyPDF.utils as _u
from PyPDF.generic import (
    NameObject, NumberObject, BooleanObject, TextStringObject,
    RefObject, ByteStringObject,
    DictObject, ArrayObject,
    StreamObject, DocInfoObject, DestObject,
    create_string_object, read_object,
)

# Bytes pattern matching a single whitespace character.
_pattern_space = _re.compile(br'\s')
# Names of the resources, media box, crop box and rotate page entries.
# NOTE(review): the name suggests these are the attributes a page may
# inherit from its ancestors in the page tree -- confirm at the use site.
_inheritable_page_attributes = (
    NameObject(_k.RESOURCES),
    NameObject(_k.MEDIA_BOX),
    NameObject(_k.CROP_BOX),
    NameObject(_k.ROTATE),
)


class PdfFileReader(object):
    def __init__(self, stream, should_chop: bool = False):
        """
        Initializes a PdfFileReader object.  This operation can take some time, as
        the PDF stream's cross-reference tables are read into memory.

        Stability: Added in v1.0, will exist for all v1.x releases.

        stream - An object that supports the standard read
Exemple #14
0
def _set_rectangle(this, name, value):
    """
    Store value in this under name, wrapping name in a NameObject first
    when it is not one already.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    this[key] = value
Exemple #15
0
 def compress_content_streams(self):
     """
     Replace this page's content with its flate-encoded form.

     Content that is not yet a _ContentStreamObject is wrapped in one
     first.  Nothing happens when the page has no content.
     """
     content = self.get_contents()
     if content is None:
         return
     if not isinstance(content, _ContentStreamObject):
         content = _ContentStreamObject(content, self.parent)
     encoded = content.flate_encode()
     self[NameObject(_k.CONTENT)] = encoded
Exemple #16
0
 def add_bookmark(self, title: str, page_index: int, container_ref=None):
     """
     Append a bookmark pointing at a page and return its reference.

     The bookmark is registered as a new object whose destination fits the
     target page.  If the container already has a count entry, the count
     is incremented and the bookmark is linked after the current last
     item; otherwise the container is initialised as an outlines
     dictionary whose first and last item are the new bookmark.

     title - The text of the bookmark.
     page_index - 1-based index of the target page (page_index - 1 is
                  passed to get_page_ref).
     container_ref - Reference to the dictionary that will hold the
                     bookmark.  Defaults to the document outlines.
     """
     container_ref = self.__outlines if container_ref is None else container_ref
     title_obj = TextStringObject(title)
     bookmark = DictObject()
     target_page = self.get_page_ref(page_index - 1)
     bookmark.update({
         NameObject(_k.TITLE): title_obj,
         NameObject(_k.PARENT): container_ref,
         NameObject(_k.DEST): ArrayObject([target_page, NameObject(b'/Fit')]),
     })
     container = container_ref.get_object()
     mark_ref = self._add_object(bookmark)
     if _k.COUNT in container:
         # Re-wrap the incremented count and use NameObject keys, as
         # _add_page and the else-branch below do, so that only PDF
         # objects end up stored in the dictionaries.
         container[NameObject(_k.COUNT)] = NumberObject(container[_k.COUNT] + 1)
         last_mark = container[NameObject(_k.LAST)]
         container[NameObject(_k.LAST)] = mark_ref
         last_mark[NameObject(_k.NEXT)] = mark_ref
         # NOTE(review): no /Prev entry is written on the new bookmark;
         # confirm against the PDF outline item requirements.
     else:
         container[NameObject(_k.TYPE)] = NameObject(_k.OUTLINES)
         container[NameObject(_k.COUNT)] = NumberObject(1)
         container[NameObject(_k.FIRST)] = mark_ref
         container[NameObject(_k.LAST)] = mark_ref
     return mark_ref
Exemple #17
0
 def _rotate(self, angle):
     """
     Add angle to the stored rotate value (0 when none is stored) and
     save the sum back as a NumberObject.
     """
     new_angle = self.get(_k.ROTATE, 0) + angle
     self[NameObject(_k.ROTATE)] = NumberObject(new_angle)