def create_blank_page(_pdf=None, width=None, height=None):
    """
    Build and return a new, empty page.

    When either dimension is missing, BOTH are taken from the media box of
    the last page of ``_pdf``.

    _pdf -- PDF file the page belongs to (may be None when both width and
            height are supplied)
    width -- The width of the new page expressed in default user space
             units.
    height -- The height of the new page expressed in default user space
              units.

    Raises utils.PageSizeNotDefinedError when a dimension is missing and
    ``_pdf`` is None or contains no page.
    """
    blank = PageObject(_pdf)

    # Minimal page dictionary (cf. PDF Reference 7.7.3.3).
    blank[NameObject(_k.TYPE)] = NameObject(_k.PAGE)
    blank[NameObject(b'/Parent')] = NullObject()
    blank[NameObject(_k.RESOURCES)] = DictObject()

    size_missing = width is None or height is None
    if size_missing:
        if _pdf is None or not (_pdf.get_pages_count() > 0):
            raise utils.PageSizeNotDefinedError()
        # Borrow the dimensions of the document's last page.
        reference_page = _pdf.get_page(_pdf.get_pages_count() - 1)
        width = reference_page.media_box.get_width()
        height = reference_page.media_box.get_height()

    blank[NameObject(_k.MEDIA_BOX)] = RectangleObject([0, 0, width, height])
    return blank
def _chop_images(self):
    """
    Trim the empty edges of the image referenced by each flattened page.

    For every page in ``self._flattened_pages`` the content stream is split
    on whitespace, the XObject named by ``_get_image_name_from`` is looked
    up, and, when its /Subtype is /Image, it is passed to
    ``_im.chop_off_image_empty_edges``.  The image's /Length, /Width and
    /Height entries and its raw data are then replaced, the page content is
    rewritten to draw the image at its new size, and the upper-right corner
    of the page's media box is set to that size.  XObjects that are not
    images are only logged.
    """
    total_pages = len(self._flattened_pages)
    for page_number, page in enumerate(self._flattened_pages, start=1):
        content_bytes: bytes = page[_k.CONTENT].get_data()
        parts = _pattern_space.split(content_bytes)
        image = page[_k.RESOURCES][b'/XObject'][_get_image_name_from(parts)]
        _u.debug(parts, len(page[_k.RESOURCES][b'/XObject']), image)
        raw_image = image.get_data()

        is_image = b'/Subtype' in image and image[b'/Subtype'] == b'/Image'
        if not is_image:
            _u.debug(raw_image)
            continue

        chopped = _im.chop_off_image_empty_edges(image, raw_image, page_number)
        width, height, compressed_length, compressed_data = chopped

        image[NameObject(b'/Length')] = NumberObject(compressed_length)
        image[NameObject(b'/Width')] = NumberObject(width)
        image[NameObject(b'/Height')] = NumberObject(height)
        image._bytes_data = compressed_data

        # We might need to insert this matrix in the stream built below:
        # 1 0 0 1 0 100 cm
        scale_matrix = (
            _u.s2b(str(width)) + b' 0 0 ' + _u.s2b(str(height)) + b' 0 0 cm '
        )
        # parts[-5] is reused as the operand of the Do operator.
        page[_k.CONTENT].set_data(b'q ' + scale_matrix + parts[-5] + b' Do Q')

        page[_k.MEDIA_BOX][2] = NumberObject(width)
        page[_k.MEDIA_BOX][3] = NumberObject(height)

        _u.debug(
            'Chopped empty edges for {:4}/{} image.'.format(page_number, total_pages),
            page[_k.MEDIA_BOX],
            page[_k.MEDIA_BOX][2:],
            parts,
            width,
            height,
            page[_k.CONTENT].get_data(),
        )
def merge_page(self, page2, page2transformation=None):
    """Merge the content stream and resources of another page into this one.

    Resource references (e.g. fonts) from both pages are kept.  This page's
    mediabox/cropbox/etc. are not altered.  The content of page2 is appended
    after this page's content, so it is drawn later, i.e. "on top".

    page2 - The PageObject whose content is merged into this page.
    page2transformation - Optional function applied to page2's content
            stream before merging.  Takes: page2 contents stream.  Must
            return: new contents stream.  If omitted, the content stream
            is used unmodified.
    """
    resource_kinds = (
        b'/ExtGState', b'/Font', b'/XObject', b'/ColorSpace',
        b'/Pattern', b'/Shading', b'/Properties',
    )
    own_resources = self[_k.RESOURCES].get_object()
    other_resources = page2[_k.RESOURCES].get_object()

    # Resources are merged first: this tells us which names used by page2's
    # content stream have to be renamed.
    merged_resources = DictObject()
    renames = {}
    for kind in resource_kinds:
        merged, renamed = _merge_resources(own_resources, other_resources, kind)
        if merged:
            merged_resources[NameObject(kind)] = merged
            renames.update(renamed)

    # /ProcSet is the set union of both pages' entries.
    own_procset = frozenset(
        own_resources.get(b'/ProcSet', ArrayObject()).get_object())
    other_procset = frozenset(
        other_resources.get(b'/ProcSet', ArrayObject()).get_object())
    merged_resources[NameObject(b'/ProcSet')] = ArrayObject(
        own_procset.union(other_procset))

    layers = ArrayObject()

    base_layer = self.get_contents()
    if base_layer is not None:
        layers.append(_push_pop_graphics_state(base_layer, self.parent))

    top_layer = page2.get_contents()
    if top_layer is not None:
        if page2transformation is not None:
            top_layer = page2transformation(top_layer)
        top_layer = _content_stream_rename(top_layer, renames, self.parent)
        layers.append(_push_pop_graphics_state(top_layer, self.parent))

    self[NameObject(_k.CONTENT)] = _ContentStreamObject(layers, self.parent)
    self[NameObject(_k.RESOURCES)] = merged_resources
def __init__(self):
    """
    Set up an empty document.

    Registers, in this order, the page tree root, the info dictionary, the
    outlines dictionary and the catalog through ``self._add_object``.
    """
    self._id = None
    self._encrypt = None
    self._encrypt_key = None
    self._objects = []  # array of indirect objects

    # Root node of the page tree, initially without any kids.
    page_tree_fields = {
        NameObject(_k.TYPE): NameObject(_k.PAGES),
        NameObject(_k.COUNT): NumberObject(0),
        NameObject(_k.KIDS): ArrayObject(),
    }
    page_tree = DictObject()
    page_tree.update(page_tree_fields)
    self._pages = self._add_object(page_tree)

    # Document information dictionary.
    info_fields = {
        NameObject(b'/Producer'): create_string_object(b'PyPDF - Refactored by QXF')
    }
    info_dict = DictObject()
    info_dict.update(info_fields)
    self._info = self._add_object(info_dict)

    # Catalog (document root), pointing at the page tree and at an
    # initially empty outlines dictionary.
    self.__outlines = self._add_object(DictObject())
    catalog_fields = {
        NameObject(_k.TYPE): NameObject(b'/Catalog'),
        NameObject(_k.PAGES): self._pages,
        NameObject(_k.OUTLINES): self.__outlines,
    }
    catalog = DictObject()
    catalog.update(catalog_fields)
    self._root = self._add_object(catalog)
def _write_trailer_to(self, stream):
    """
    Write the ``trailer`` keyword and the trailer dictionary to ``stream``.

    The dictionary always carries the size (number of registered objects
    plus one), the root and the info entries; the ID and encrypt entries are
    added only when they are set on this writer.

    stream - Writable binary stream receiving the trailer.
    """
    stream.write(b'trailer\n')

    entries = {
        NameObject(_k.SIZE): NumberObject(len(self._objects) + 1),
        NameObject(_k.ROOT): self._root,
        NameObject(_k.INFO): self._info,
    }
    trailer_dict = DictObject()
    trailer_dict.update(entries)

    # Optional entries, in the same order as they were always emitted.
    optional_entries = ((_k.ID, self._id), (_k.ENCRYPT, self._encrypt))
    for key, value in optional_entries:
        if value is not None:
            trailer_dict[NameObject(key)] = value

    trailer_dict.write_to_stream(stream)
def _add_page(self, page, callback_add):
    """
    Shared implementation for appending or inserting a page.

    The page's /Parent is pointed at this document's page tree, the page is
    registered through ``self._add_object`` and the page tree's count is
    incremented by one.

    page - The page to add to the document; its type entry must be a page.
    callback_add - Function that places the page in the kids array.
            Takes: page list, page to add.
    """
    assert page[_k.TYPE] == _k.PAGE

    page[NameObject(b'/Parent')] = self._pages
    page_ref = self._add_object(page)

    page_tree = self._pages.get_object()
    callback_add(page_tree[_k.KIDS], page_ref)

    updated_count = page_tree[_k.COUNT] + 1
    page_tree[NameObject(_k.COUNT)] = NumberObject(updated_count)
def add_transformation(self, ctm):
    """
    Apply the transformation matrix ``ctm`` to this page's content.

    The transformed content is wrapped by ``_push_pop_graphics_state`` and
    stored back as the page content.  A page without content is left
    untouched.

    ctm - Transformation matrix, passed through to
          ``_add_transformation_matrix``.
    """
    current = self.get_contents()
    if current is None:
        return

    transformed = _add_transformation_matrix(current, self.parent, ctm)
    self[NameObject(_k.CONTENT)] = _push_pop_graphics_state(transformed, self.parent)
def __parse_xref_stream(self):
    """
    Parse one cross-reference stream (PDF 1.5+) at the current position.

    Reads the stream object, records its entries in ``self._xref`` (type 1)
    and ``self._xref_obj_stream`` (type 2), and copies the root, encrypt,
    info and ID entries into ``self._trailer`` when they are not already
    present there.

    Returns the stream's /Prev entry, or None when it has none.
    """
    # PDF 1.5+ Cross-Reference Stream
    # Step back one byte before reading the object header.
    self._stream.seek(-1, io.SEEK_CUR)
    idnum, generation = _read_object_header(self._stream)
    xrefstream = read_object(self._stream, self)
    assert xrefstream[_k.TYPE] == b'/XRef'
    self._cache_indirect_object(generation, idnum, xrefstream)
    stream_data = BytesIO(xrefstream.get_data())
    # Without /Index there is a single subsection: [0, Size].
    idx_pairs = xrefstream.get(b'/Index', [0, xrefstream.get(_k.SIZE)])
    # /W lists the byte width of each field of an entry.
    entry_sizes = xrefstream.get(b'/W')
    for num, size in _generate_pairs(idx_pairs):
        cnt = 0
        # NOTE: these are reset once per subsection, not once per entry, so
        # a field that is not assigned for an entry keeps the value read
        # for the previous entry.
        xref_type = None
        byte_offset = None
        objstr_num = None
        obstr_idx = None
        while cnt < size:
            # Read the fields of one entry: field 0 is the entry type, the
            # meaning of fields 1 and 2 depends on that type.
            for i in range(len(entry_sizes)):
                d = stream_data.read(entry_sizes[i])
                di = _convert_to_int(d, entry_sizes[i])
                if i == 0:
                    xref_type = di
                elif i == 1:
                    if xref_type == 0:
                        # Read but never used afterwards.
                        _next_free_object = di
                    elif xref_type == 1:
                        byte_offset = di
                    elif xref_type == 2:
                        objstr_num = di
                elif i == 2:
                    if xref_type == 0:
                        # Read but never used afterwards.
                        _next_generation = di
                    elif xref_type == 1:
                        # NOTE(review): this rebinds the `generation` that
                        # was read from the stream's own object header.
                        generation = di
                    elif xref_type == 2:
                        obstr_idx = di
            if xref_type == 0:
                _u.debug('Ignored xref_type == 0, do not know why.')
            elif xref_type == 1:
                # Entries already recorded are never overwritten.
                if generation not in self._xref:
                    self._xref[generation] = {}
                if num not in self._xref[generation]:
                    self._xref[generation][num] = byte_offset
            elif xref_type == 2:
                # Recorded as [objstr_num, obstr_idx]; existing entries are kept.
                if num not in self._xref_obj_stream:
                    self._xref_obj_stream[num] = [objstr_num, obstr_idx]
            else:
                _u.debug(f'Unknown xref_type {xref_type}. Ignored.')
            cnt += 1
            num += 1
    # The xref stream dictionary also carries trailer entries; copy the ones
    # that have not been set yet, without resolving them (raw_get).
    trailer_keys = _k.ROOT, _k.ENCRYPT, _k.INFO, _k.ID
    for key in trailer_keys:
        if key in xrefstream and key not in self._trailer:
            self._trailer[NameObject(key)] = xrefstream.raw_get(key)
    if b'/Prev' in xrefstream:
        return xrefstream[b'/Prev']
    else:
        return None
def _merge_resources(res1, res2, resource):
    """
    Merge one resource category of two resource dictionaries.

    Entries of ``res1`` are kept as they are.  An entry of ``res2`` whose
    key is unused is copied over (unresolved, via ``raw_get``); an entry
    whose key is already taken by a different value is stored under the key
    with ``b'renamed'`` appended.

    res1 - Resource dictionary of the first page.
    res2 - Resource dictionary of the second page.
    resource - Key of the category to merge (e.g. b'/Font').

    Returns a tuple ``(merged, renames)``: the merged dictionary and a
    mapping from each renamed key of ``res2`` to its new name.
    """
    merged = DictObject()
    merged.update(res1.get(resource, DictObject()).get_object())
    incoming = res2.get(resource, DictObject()).get_object()

    renames = {}
    for key in incoming.keys():
        if key not in merged:
            merged[key] = incoming.raw_get(key)
        elif merged[key] != incoming[key]:
            # Same key, different value: keep both under distinct names.
            replacement = NameObject(key + b'renamed')
            renames[key] = replacement
            merged[replacement] = incoming[key]
    return merged, renames
def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True):
    """Encrypt this PDF file with the PDF Standard encryption handler.

    user_pwd - The "user password", which allows for opening and reading
            the PDF file with the restrictions provided.
    owner_pwd - The "owner password", which allows for opening the PDF
            files without any restrictions.  By default, the owner
            password is the same as the user password.
    use_128bit - Boolean argument as to whether to use 128bit encryption.
            When false, 40bit encryption will be used.  By default, this
            flag is on.

    Side effects: sets ``self._id`` to a freshly generated pair of
    identifiers, registers the encryption dictionary and stores its
    reference in ``self._encrypt``, and stores the key in
    ``self._encrypt_key``.
    """
    if owner_pwd is None:
        owner_pwd = user_pwd

    # keylen is the key length in BYTES.  Floor division keeps it an int;
    # true division ('/') would turn it into a float (16.0 / 5.0) on
    # Python 3.
    if use_128bit:
        v = 2
        rev = 3
        keylen = 128 // 8
    else:
        v = 1
        rev = 2
        keylen = 40 // 8

    # permit everything:
    p = -1
    o = ByteStringObject(_u.algorithm_33(owner_pwd, user_pwd, rev, keylen))

    # The two identifiers are MD5 digests of the current time and of a
    # random number.
    id_1 = _md5(bytes(repr(time.time()), _u.ENCODING_UTF8)).digest()
    id_2 = _md5(bytes(repr(random.random()), _u.ENCODING_UTF8)).digest()
    self._id = ArrayObject((ByteStringObject(id_1), ByteStringObject(id_2)))

    if rev == 2:
        u, key = _u.algorithm_34(user_pwd, o, p, id_1)
    else:
        assert rev == 3
        u, key = _u.algorithm_35(user_pwd, rev, keylen, o, p, id_1, False)

    encrypt = DictObject()
    encrypt[NameObject(b'/Filter')] = NameObject(b'/Standard')
    encrypt[NameObject(b'/V')] = NumberObject(v)
    if v == 2:
        # /Length is expressed in bits.
        encrypt[NameObject(b'/Length')] = NumberObject(keylen * 8)
    encrypt[NameObject(b'/R')] = NumberObject(rev)
    encrypt[NameObject(b'/O')] = ByteStringObject(o)
    encrypt[NameObject(b'/U')] = ByteStringObject(u)
    encrypt[NameObject(b'/P')] = NumberObject(p)

    self._encrypt = self._add_object(encrypt)
    self._encrypt_key = key
def _build_outline(self, node):
    """
    Build the outline entry described by an outline ``node``.

    The destination is taken from the node's /A action (only type GoTo is
    supported) or from its destination entry; in both cases the node must
    also have a title.

    node - Outline item dictionary.

    Returns the outline entry, or None when the node has no usable
    destination.

    Raises _u.PdfReadError when the destination is neither an array nor a
    string naming a known destination.
    """
    dest, title, outline = None, None, None

    if b'/A' in node and _k.TITLE in node:
        # Action, section 8.5 (only type GoTo supported)
        title = node[_k.TITLE]
        action = node[b'/A']
        if action[b'/S'] == b'/GoTo':
            dest = action[b'/D']
    elif _k.DEST in node and _k.TITLE in node:
        # DestObject, section 8.2.1
        title = node[_k.TITLE]
        dest = node[_k.DEST]

    # if destination found, then create outline
    if dest:
        if isinstance(dest, ArrayObject):
            outline = _build_destination(title, dest)
        # Named destination: check the type first, then look the name up.
        # The previous form `dest in isinstance(dest, str) and ...` ran a
        # membership test against a bool and raised TypeError.
        # NOTE(review): only str names are accepted here; confirm whether
        # bytes-based names can occur as well.
        elif isinstance(dest, str) and dest in self._named_dests:
            outline = self._named_dests[dest]
            outline[NameObject(_k.TITLE)] = title
        else:
            raise _u.PdfReadError("Unexpected destination %r" % dest)
    return outline
def get_pages_count(self):
    """Return, as an int, the count entry of the object behind ``self._pages``."""
    page_tree = self._pages.get_object()
    count = page_tree[NameObject(_k.COUNT)]
    return int(count)
import PyPDF.compound as _c import PyPDF.image_tools as _im import PyPDF.keys as _k import PyPDF.utils as _u from PyPDF.generic import ( NameObject, NumberObject, BooleanObject, TextStringObject, RefObject, ByteStringObject, DictObject, ArrayObject, StreamObject, DocInfoObject, DestObject, create_string_object, read_object, ) _pattern_space = _re.compile(br'\s') _inheritable_page_attributes = ( NameObject(_k.RESOURCES), NameObject(_k.MEDIA_BOX), NameObject(_k.CROP_BOX), NameObject(_k.ROTATE), ) class PdfFileReader(object): def __init__(self, stream, should_chop: bool = False): """ Initializes a PdfFileReader object. This operation can take some time, as the PDF stream's cross-reference tables are read into memory. Stability: Added in v1.0, will exist for all v1.x releases. stream - An object that supports the standard read
def _set_rectangle(this, name, value):
    """
    Store ``value`` in ``this`` under ``name``.

    this - The mapping to update.
    name - The key; wrapped in a NameObject unless it already is one.
    value - The value to store.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    this[key] = value
def compress_content_streams(self):
    """
    Replace this page's content with its flate-encoded form.

    Content that is not yet a ``_ContentStreamObject`` is wrapped in one
    first.  A page without content is left untouched.
    """
    stream = self.get_contents()
    if stream is None:
        return

    if not isinstance(stream, _ContentStreamObject):
        stream = _ContentStreamObject(stream, self.parent)
    self[NameObject(_k.CONTENT)] = stream.flate_encode()
def add_bookmark(self, title: str, page_index: int, container_ref=None):
    """
    Append a bookmark (outline item) pointing at a page.

    title - Text of the bookmark.
    page_index - 1-based number of the target page (``page_index - 1`` is
            passed to ``get_page_ref``).
    container_ref - Reference to the outline dictionary that receives the
            bookmark; defaults to this document's outlines dictionary.

    Returns the reference to the newly registered bookmark object.
    """
    if container_ref is None:
        container_ref = self.__outlines

    title_obj = TextStringObject(title)
    mark_mami = DictObject()
    target_page = self.get_page_ref(page_index - 1)
    mark_mami.update({
        NameObject(_k.TITLE): title_obj,
        NameObject(_k.PARENT): container_ref,
        # Destination: show the target page fitted to the window.
        NameObject(_k.DEST): ArrayObject([target_page, NameObject(b'/Fit')]),
    })

    container = container_ref.get_object()
    mark_ref = self._add_object(mark_mami)

    if _k.COUNT in container:
        # Container already has bookmarks: bump the count and chain the new
        # one after the current last entry.  The count is re-wrapped in a
        # NumberObject and the keys in NameObject, consistent with the
        # other entries written by this function.
        count_key = NameObject(_k.COUNT)
        container[count_key] = NumberObject(container[count_key] + 1)
        last_mark = container[NameObject(_k.LAST)]
        container[NameObject(_k.LAST)] = mark_ref
        last_mark[NameObject(_k.NEXT)] = mark_ref
    else:
        # First bookmark of this container.
        container[NameObject(_k.TYPE)] = NameObject(_k.OUTLINES)
        container[NameObject(_k.COUNT)] = NumberObject(1)
        container[NameObject(_k.FIRST)] = mark_ref
        container[NameObject(_k.LAST)] = mark_ref
    return mark_ref
def _rotate(self, angle):
    """
    Add ``angle`` to this object's rotation entry.

    A missing rotation entry counts as 0.

    angle - Amount to add to the current rotation.
    """
    rotated = self.get(_k.ROTATE, 0) + angle
    self[NameObject(_k.ROTATE)] = NumberObject(rotated)