def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' Build (or fetch from cache) the Form XObject for one view of a page.

        page             -- page object; its inheritable attributes supply
                            Resources and Rotate, its Contents supply the
                            drawing stream(s).
        viewinfo         -- view description handed to getrects(); its
                            rotate attribute is added to the page rotation.
                            (Defaults to entire page.)
        allow_compressed -- when false, assert that the content stream
                            dictionary has exactly one entry.

        Returns the result of _cache_xobj() for the resolved contents.
    '''
    inherited = page.inheritable
    page_resources = inherited.Resources
    angle = get_rotation(inherited.Rotate)
    mbox, bbox = getrects(inherited, viewinfo, angle)
    angle += get_rotation(viewinfo.rotate)

    if not isinstance(page.Contents, PdfArray):
        contents = page.Contents
    elif len(page.Contents) == 1:
        contents = page.Contents[0]
    else:
        # decompress and join multiple streams
        parts = list(page.Contents)
        uncompress(parts)
        joined = '\n'.join([part.stream for part in parts])
        contents = PdfDict(stream=joined)

    # All the filters must have been executed, so the declared Length
    # has to agree with the size of the stream we hold.
    assert int(contents.Length) == len(contents.stream)
    if not allow_compressed:
        # Make sure the only attribute is length
        assert len(list(contents.iteritems())) == 1
    return _cache_xobj(contents, page_resources, mbox, bbox, angle)
def _cache_xobj(contents, resources, mbox, bbox, rotation):
    ''' Return a cached Form XObject, or create a new one and cache it.
        Adds private members x, y, w, h

        contents  -- content object; its xobj_cachedict attribute maps
                     (mbox, bbox, rotation) keys to finished XObjects.
        resources -- resources passed on to the XObject builder.
        mbox/bbox -- media box and view box; when they differ the
                     sub-page builder is used, else the full-page one.
        rotation  -- rotation; a non-zero value adds a Matrix entry and
                     rotates the rectangle used for x, y, w, h.
    '''
    cachedict = contents.xobj_cachedict
    if cachedict is None:
        # A content object that was built on the fly (e.g. the joined
        # stream created for multi-stream pages) has no cache yet.
        # Create it here instead of failing on None.get() below.
        cachedict = contents.private.xobj_cachedict = {}

    cachekey = mbox, bbox, rotation
    result = cachedict.get(cachekey)
    if result is None:
        # Index the builder pair with the boolean: False -> full page,
        # True -> sub page.
        func = (_get_fullpage, _get_subpage)[mbox != bbox]
        result = PdfDict(
            func(contents, resources, mbox, bbox, rotation),
            Type=PdfName.XObject,
            Subtype=PdfName.Form,
            FormType=1,
            BBox=PdfArray(bbox),
        )
        rect = bbox
        if rotation:
            # Matrix is the rotated unit vectors followed by a zero
            # translation.
            matrix = (rotate_point((1, 0), rotation) +
                      rotate_point((0, 1), rotation))
            result.Matrix = PdfArray(matrix + (0, 0))
            rect = rotate_rect(rect, rotation)

        result.private.x = rect[0]
        result.private.y = rect[1]
        result.private.w = rect[2] - rect[0]
        result.private.h = rect[3] - rect[1]
        cachedict[cachekey] = result
    return result
def _build_cache(contents, allow_compressed):
    ''' Build a new dictionary holding the stream,
        and save it along with private cache info.
        Assumes validity has been pre-checked if
        we have a non-None xobj_copy.

        contents         -- either a single content dictionary or an
                            array of them (see the try/except below).
        allow_compressed -- when false (or when several streams had to
                            be joined), every dictionary in the array
                            must have exactly one entry.

        Returns the copy stored as xobj_copy; an existing non-None
        copy is returned unchanged.
    '''
    try:
        xobj_copy = contents.xobj_copy
    except AttributeError:
        # Should have a PdfArray here...
        array = contents
        private = contents
    else:
        # Should have a PdfDict here -- might or might not have cache copy
        if xobj_copy is not None:
            return xobj_copy
        array = [contents]
        private = contents.private

    # The spec says nothing about nested arrays.  Will
    # assume that's not a problem until we encounter them...

    # The copy starts from the first dictionary and gets an empty
    # per-view XObject cache.
    xobj_copy = PdfDict(array[0])
    xobj_copy.private.xobj_cachedict = {}
    private.xobj_copy = xobj_copy

    if len(array) > 1:
        # Streams are joined with one newline between neighbours, hence
        # the "+ len(array) - 1" in the expected length.
        newstream = '\n'.join(x.stream for x in array)
        newlength = sum(int(x.Length) for x in array) + len(array) - 1
        assert newlength == len(newstream)
        xobj_copy.stream = newstream

        # Cannot currently cope with different kinds of
        # compression in the array, so just disallow it.
        allow_compressed = False

    if not allow_compressed:
        # Make sure there are no compression parameters
        for cdict in array:
            assert len([x for x in cdict.iteritems()]) == 1

    # NOTE(review): the local name cachedict is never read afterwards;
    # only the attribute assignment on contents has any effect here.
    # The nesting level of this statement could not be recovered from
    # the source layout -- confirm against the original file.
    cachedict = contents.cachedict = {}
    return xobj_copy
def get_jbig2_images(psem):
    """Encode all images of the shared group and return the raw results.

    Generator-based coroutine.  Reads temp_dir, symbol_mode,
    images_with_shared_globals and self from the enclosing scope.

    Returns a tuple (jbig2_images, jbig2_globals): a list with the
    encoder output per image and, in symbol mode, a PdfDict holding the
    contents of "output.sym" (otherwise None).
    """
    def input_png(index):
        # Absolute path of the intermediate bitonal png for one image
        return path.abspath(path.join(temp_dir, "input.%d.png" % index))

    # Convert images with ImageMagick to bitonal png in parallel
    convert_jobs = []
    for index, image in enumerate(images_with_shared_globals):
        convert_jobs.append(run_command_async(
            [CONVERT_CMD, "-alpha", "remove", "-alpha", "off",
             "-colorspace", "gray", "-threshold", "50%",
             path.abspath(image.filename), input_png(index)],
            psem))
    yield from asyncio.gather(*convert_jobs)

    cmd = [JBIG2_CMD, "-p"]
    if symbol_mode:
        cmd += ["-s", "-t", format_number(self.jbig2_threshold, 4)]
    cmd += [input_png(index)
            for index in range(len(images_with_shared_globals))]

    if not symbol_mode:
        # Without symbol mode the single result is whatever the command
        # runner hands back (presumably the encoder's stdout).
        command_result = yield from run_command_async(
            cmd, psem, cwd=temp_dir)
        return [command_result], None

    # Symbol mode: results are read back from files in temp_dir.
    yield from run_command_async(cmd, psem, cwd=temp_dir)
    jbig2_globals = PdfDict()
    jbig2_globals.indirect = True
    with open(path.join(temp_dir, "output.sym"), "rb") as sym_file:
        jbig2_globals.stream = sym_file.read().decode("latin-1")
    jbig2_images = []
    for index in range(len(images_with_shared_globals)):
        with open(path.join(temp_dir, "output.%04d" % index), "rb") as out:
            jbig2_images.append(out.read())
    return jbig2_images, jbig2_globals
def _get_trailer(self):
    """Return the trailer dictionary, building and caching it on first use.

    A trailer already stored in self._trailer is returned as is.
    Otherwise a Catalog/Pages skeleton is created around self.pagearray,
    every page gets its Parent set to the Pages node, and the result is
    remembered in self._trailer.
    """
    cached = self._trailer
    if cached is None:
        # Create the basic object structure of the PDF file
        pages_node = IndirectPdfDict(
            Type=PdfName.Pages,
            Count=PdfObject(len(self.pagearray)),
            Kids=self.pagearray)
        catalog = IndirectPdfDict(Type=PdfName.Catalog, Pages=pages_node)
        cached = PdfDict(Root=catalog)

        # Make all the pages point back to the page dictionary and
        # ensure they are indirect references
        pagedict = cached.Root.Pages
        for kid in pagedict.Kids:
            kid.Parent = pagedict
        self._trailer = cached
    return cached
def _add_font_resources(resources, A):
    """Copy the fonts of *A* into a fresh Font dictionary on *resources*.

    Does nothing when A.fonts is empty; otherwise resources.Font is
    replaced by a new PdfDict keyed by PdfName(font name).
    """
    if not A.fonts:
        return
    resources.Font = PdfDict()
    for name, font_obj in A.fonts.items():
        resources.Font[PdfName(name)] = font_obj
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True):
    ''' Read a PDF from a file name, a file-like object or raw data.

        fname      -- path of the file to open, or any object with a
                      read() method.  Mutually exclusive with fdata.
        fdata      -- the complete file contents.
        decompress -- when true, self.uncompress() is called after the
                      pages have been read.
        disable_gc -- when true (and the collector is currently on),
                      garbage collection is switched off while parsing
                      and switched back on afterwards.

        Raises PdfParseError when the file cannot be read, when no
        '%PDF-' header is present, or when no '%EOF' marker is found.
    '''
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The with-block closes the file even when read()
                    # fails; previously the handle leaked in that case.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError('Could not read PDF file %s' %
                                        fname)

        assert fdata is not None
        if not fdata.startswith('%PDF-'):
            startloc = fdata.find('%PDF-')
            if startloc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        self.version = fdata[5:8]

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            raise PdfParseError('EOF mark not found: %s' %
                                repr(fdata[-20:]))
        # Keep the 4-character marker plus two more characters
        # (presumably its line ending -- TODO confirm); anything after
        # that is treated as junk.
        endloc += 6
        junk = fdata[endloc:]
        fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Token -> handler table; the stray delimiters added below all
        # map to the bad-token handler.
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        startloc, source = self.findxref(fdata)
        private.source = source

        xref_list = []
        source.all_offsets = []
        while 1:
            source.obj_offsets = {}

            # Loop through all the cross-reference tables/streams
            trailer = self.parsexref(source)

            # Loop if any previously-written xrefs.
            prev = trailer.Prev
            if prev is None:
                token = source.next()
                if token != 'startxref':
                    source.warning(
                        'Expected "startxref" at end of xref table')
                break
            if not xref_list:
                # First trailer seen: remember it so it can be merged
                # back over the last one parsed.
                trailer.Prev = None
                original_trailer = trailer
            source.floc = int(prev)
            xref_list.append(source.obj_offsets)

        if xref_list:
            # Apply the collected offset tables in reverse order of
            # parsing, so tables parsed earlier overwrite later ones.
            for update in reversed(xref_list):
                source.obj_offsets.update(update)
            trailer.update(original_trailer)

        if trailer.Version and \
                float(trailer.Version) > float(self.version):
            self.version = trailer.Version

        trailer = PdfDict(
            Root=trailer.Root,
            Info=trailer.Info,
            ID=trailer.ID
            # TODO: add Encrypt when implemented
        )
        self.update(trailer)

        #self.read_all_indirect(source)
        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()
    finally:
        if disable_gc:
            gc.enable()
def make_page(page, pdf_page, psem):
    """Fill *pdf_page* with the PDF objects that render *page*.

    Generator-based coroutine.  Besides its arguments it reads self,
    pdf_group, pdf_font_mapping, pdf_pages and progress_cb from the
    enclosing scope and increments the enclosing finished_pages counter.

    page     -- source page (thumbnail, background, foreground, color,
                text, width, height are read).
    pdf_page -- page dictionary that receives MediaBox, Group, Thumb,
                Contents, Annots and Resources.
    psem     -- passed through to every image/thumbnail/mask coroutine.
    """
    # Prepare everything in parallel
    @asyncio.coroutine
    def get_pdf_thumbnail(psem):
        if page.thumbnail is None:
            return None
        return (yield from page.thumbnail.pdf_thumbnail(psem))

    @asyncio.coroutine
    def get_pdf_background(psem):
        if page.background is None:
            return None
        return (yield from page.background.pdf_image(psem))

    @asyncio.coroutine
    def get_pdf_mask(foreground, psem):
        # Only foregrounds without a color get a mask object.
        if foreground.color is not None:
            return None
        return (yield from foreground.pdf_mask(psem))

    pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
        yield from asyncio.gather(
            get_pdf_thumbnail(psem),
            get_pdf_background(psem),
            asyncio.gather(*[fg.pdf_image(psem) for fg in page.foreground]),
            asyncio.gather(*[get_pdf_mask(fg, psem) for fg in page.foreground])))
    pdf_page.MediaBox = PdfArray([0, 0,
                                  PdfNumber(page.width),
                                  PdfNumber(page.height)])
    pdf_page.Group = pdf_group
    pdf_resources = PdfDict()
    pdf_xobject = PdfDict()
    if pdf_thumbnail is not None:
        pdf_page.Thumb = pdf_thumbnail
    # Running number used for the /Im<n> XObject names of this page.
    im_index = 0
    # Save graphics state and scale unity rectangle to page size
    matrix = TransformationMatrix()
    matrix.scale(page.width, page.height)
    before_graphics = ("q\n" +
                       "%s cm\n" % matrix.to_pdf())
    after_graphics = "\nQ\n"
    contents = ""
    graphics = ""
    # Last fill color written to the stream; used to skip repeated "rg".
    current_color = None
    if page.color != self._factory.WHITE:
        # Non-white page color: fill the unit rectangle with it.
        if current_color != page.color:
            current_color = page.color
            graphics += page.color.to_pdf() + " rg "
        graphics += ("0 0 1 1 re " +
                     "f\n")
    if pdf_background is not None:
        pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
        graphics += "/Im%d Do\n" % im_index
        im_index += 1
    for foreground, pdf_foreground, pdf_mask in zip(
            page.foreground, pdf_foregrounds, pdf_masks):
        if pdf_mask is not None:
            # The mask is registered under its own name but is not
            # drawn with "Do".
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
            im_index += 1
        pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
        if (foreground.color is not None and
                current_color != foreground.color):
            current_color = foreground.color
            graphics += foreground.color.to_pdf() + " rg "
        graphics += "/Im%d Do\n" % im_index
        im_index += 1
    if graphics:
        contents += (before_graphics +
                     graphics.rstrip(" \n") +
                     after_graphics)
    current_color = None
    # Text is set in font F1 with text rendering mode 3.
    before_text = ("BT\n" +
                   "/F1 1 Tf 3 Tr\n")
    after_text = "\nET\n"
    text = ""
    pdf_annots = []
    for t in page.text:
        if t.text:
            matrix = TransformationMatrix()
            # Glyph size is 0.5 x 1
            matrix.scale(2 / len(t.text), 1)
            matrix.translate(-0.5, -0.5)
            if t.direction == "ltr":
                pass
            elif t.direction == "rtl":
                matrix.translate(0, -1)
            elif t.direction == "ttb":
                matrix.rotate(90)
            matrix.rotate(-t.rotation)
            matrix.translate(0.5, 0.5)
            matrix.scale(t.width, t.height)
            matrix.translate(t.x, t.y)
            # The string is written as hex-encoded UTF-16-BE bytes.
            text += "%s Tm %s Tj\n" % (
                matrix.to_pdf(),
                PdfString().from_bytes(
                    t.text.encode("utf-16-be"), bytes_encoding="hex"))
        if t.external_link is not None or t.internal_link is not None:
            # One Link annotation covering the text rectangle.
            pdf_annot = PdfDict()
            pdf_annots.append(pdf_annot)
            pdf_annot.Type = PdfName.Annot
            pdf_annot.Subtype = PdfName.Link
            pdf_annot.Border = [0, 0, 0]
            pdf_annot.Rect = [PdfNumber(t.x), PdfNumber(t.y),
                              PdfNumber(t.x + t.width),
                              PdfNumber(t.y + t.height)]
            if t.external_link is not None:
                pdf_a = PdfDict()
                pdf_annot.A = pdf_a
                pdf_a.Type = PdfName.Action
                pdf_a.S = PdfName.URI
                pdf_a.URI = t.external_link.decode("latin-1")
            if t.internal_link is not None:
                # internal_link is (page index, (x, y)).
                pdf_target_page = pdf_pages[t.internal_link[0]]
                target_x, target_y = t.internal_link[1]
                pdf_annot.Dest = [
                    pdf_target_page,
                    PdfName.XYZ,
                    PdfNumber(target_x),
                    PdfNumber(target_y),
                    0]
    text = text.rstrip(" \n")
    if text:
        pdf_resources.Font = pdf_font_mapping
        contents += (before_text +
                     text +
                     after_text)
    contents = contents.rstrip(" \n")
    if contents:
        pdf_contents = PdfDict()
        pdf_contents.indirect = True
        pdf_page.Contents = pdf_contents
        if COMPRESS_PAGE_CONTENTS:
            # The compressed bytes are stored as a latin-1 decoded str.
            pdf_contents.Filter = [PdfName.FlateDecode]
            pdf_contents.stream = zlib.compress(
                contents.encode("latin-1"), 9).decode("latin-1")
        else:
            pdf_contents.stream = contents
    if pdf_annots:
        pdf_page.Annots = pdf_annots
    if pdf_xobject:
        pdf_resources.XObject = pdf_xobject
    if pdf_resources:
        pdf_page.Resources = pdf_resources
    # Report progress
    nonlocal finished_pages
    finished_pages += 1
    if progress_cb:
        # The callback receives the finished fraction of all pages.
        progress_cb(finished_pages / len(self._pages))
def write_async(self, outfile, process_semaphore, progress_cb=None):
    """Build the PDF for all pages and write it to *outfile*.

    Generator-based coroutine.  All pages are prepared concurrently,
    the document is written to a temporary file with PdfWriter, and
    QPDF_CMD is then run to produce *outfile* from it.

    outfile           -- path of the final PDF.
    process_semaphore -- passed to every page coroutine and to
                         run_command_async.
    progress_cb       -- optional callable; called after each finished
                         page with the finished fraction of all pages.
    """
    pdf_writer = PdfWriter(version="1.5")
    # Transparency group dictionary shared by every page.
    pdf_group = PdfDict()
    pdf_group.indirect = True
    pdf_group.CS = PdfName.DeviceRGB
    pdf_group.I = PdfBool(True)
    pdf_group.S = PdfName.Transparency
    # Font resource dictionary shared by every page that has text.
    pdf_font_mapping = PdfDict()
    pdf_font_mapping.indirect = True
    pdf_font_mapping.F1 = self._build_font()

    for _ in self._pages:
        pdf_page = PdfDict()
        pdf_page.Type = PdfName.Page
        pdf_writer.addpage(pdf_page)
    # pdfrw makes an internal copy of the pages
    # use the copy so that references to pages in links are correct
    pdf_pages = list(pdf_writer.pagearray)

    # Handle all pages in parallel
    @asyncio.coroutine
    def make_page(page, pdf_page, psem):
        # Prepare everything in parallel
        @asyncio.coroutine
        def get_pdf_thumbnail(psem):
            if page.thumbnail is None:
                return None
            return (yield from page.thumbnail.pdf_thumbnail(psem))

        @asyncio.coroutine
        def get_pdf_background(psem):
            if page.background is None:
                return None
            return (yield from page.background.pdf_image(psem))

        @asyncio.coroutine
        def get_pdf_mask(foreground, psem):
            # Only foregrounds without a color get a mask object.
            if foreground.color is not None:
                return None
            return (yield from foreground.pdf_mask(psem))

        pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
            yield from asyncio.gather(
                get_pdf_thumbnail(psem),
                get_pdf_background(psem),
                asyncio.gather(*[fg.pdf_image(psem) for fg in page.foreground]),
                asyncio.gather(*[get_pdf_mask(fg, psem) for fg in page.foreground])))
        pdf_page.MediaBox = PdfArray([0, 0,
                                      PdfNumber(page.width),
                                      PdfNumber(page.height)])
        pdf_page.Group = pdf_group
        pdf_resources = PdfDict()
        pdf_xobject = PdfDict()
        if pdf_thumbnail is not None:
            pdf_page.Thumb = pdf_thumbnail
        # Running number used for the /Im<n> XObject names of this page.
        im_index = 0
        # Save graphics state and scale unity rectangle to page size
        matrix = TransformationMatrix()
        matrix.scale(page.width, page.height)
        before_graphics = ("q\n" +
                           "%s cm\n" % matrix.to_pdf())
        after_graphics = "\nQ\n"
        contents = ""
        graphics = ""
        # Last fill color written; used to skip repeated "rg" operators.
        current_color = None
        if page.color != self._factory.WHITE:
            # Non-white page color: fill the unit rectangle with it.
            if current_color != page.color:
                current_color = page.color
                graphics += page.color.to_pdf() + " rg "
            graphics += ("0 0 1 1 re " +
                         "f\n")
        if pdf_background is not None:
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        for foreground, pdf_foreground, pdf_mask in zip(
                page.foreground, pdf_foregrounds, pdf_masks):
            if pdf_mask is not None:
                # The mask is registered under its own name but is not
                # drawn with "Do".
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
                im_index += 1
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
            if (foreground.color is not None and
                    current_color != foreground.color):
                current_color = foreground.color
                graphics += foreground.color.to_pdf() + " rg "
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        if graphics:
            contents += (before_graphics +
                         graphics.rstrip(" \n") +
                         after_graphics)
        current_color = None
        # Text is set in font F1 with text rendering mode 3.
        before_text = ("BT\n" +
                       "/F1 1 Tf 3 Tr\n")
        after_text = "\nET\n"
        text = ""
        pdf_annots = []
        for t in page.text:
            if t.text:
                matrix = TransformationMatrix()
                # Glyph size is 0.5 x 1
                matrix.scale(2 / len(t.text), 1)
                matrix.translate(-0.5, -0.5)
                if t.direction == "ltr":
                    pass
                elif t.direction == "rtl":
                    matrix.translate(0, -1)
                elif t.direction == "ttb":
                    matrix.rotate(90)
                matrix.rotate(-t.rotation)
                matrix.translate(0.5, 0.5)
                matrix.scale(t.width, t.height)
                matrix.translate(t.x, t.y)
                # The string is written as hex-encoded UTF-16-BE bytes.
                text += "%s Tm %s Tj\n" % (
                    matrix.to_pdf(),
                    PdfString().from_bytes(
                        t.text.encode("utf-16-be"), bytes_encoding="hex"))
            if t.external_link is not None or t.internal_link is not None:
                # One Link annotation covering the text rectangle.
                pdf_annot = PdfDict()
                pdf_annots.append(pdf_annot)
                pdf_annot.Type = PdfName.Annot
                pdf_annot.Subtype = PdfName.Link
                pdf_annot.Border = [0, 0, 0]
                pdf_annot.Rect = [PdfNumber(t.x), PdfNumber(t.y),
                                  PdfNumber(t.x + t.width),
                                  PdfNumber(t.y + t.height)]
                if t.external_link is not None:
                    pdf_a = PdfDict()
                    pdf_annot.A = pdf_a
                    pdf_a.Type = PdfName.Action
                    pdf_a.S = PdfName.URI
                    pdf_a.URI = t.external_link.decode("latin-1")
                if t.internal_link is not None:
                    # internal_link is (page index, (x, y)).
                    pdf_target_page = pdf_pages[t.internal_link[0]]
                    target_x, target_y = t.internal_link[1]
                    pdf_annot.Dest = [
                        pdf_target_page,
                        PdfName.XYZ,
                        PdfNumber(target_x),
                        PdfNumber(target_y),
                        0]
        text = text.rstrip(" \n")
        if text:
            pdf_resources.Font = pdf_font_mapping
            contents += (before_text +
                         text +
                         after_text)
        contents = contents.rstrip(" \n")
        if contents:
            pdf_contents = PdfDict()
            pdf_contents.indirect = True
            pdf_page.Contents = pdf_contents
            if COMPRESS_PAGE_CONTENTS:
                # The compressed bytes are stored as a latin-1 decoded str.
                pdf_contents.Filter = [PdfName.FlateDecode]
                pdf_contents.stream = zlib.compress(
                    contents.encode("latin-1"), 9).decode("latin-1")
            else:
                pdf_contents.stream = contents
        if pdf_annots:
            pdf_page.Annots = pdf_annots
        if pdf_xobject:
            pdf_resources.XObject = pdf_xobject
        if pdf_resources:
            pdf_page.Resources = pdf_resources
        # Report progress
        nonlocal finished_pages
        finished_pages += 1
        if progress_cb:
            progress_cb(finished_pages / len(self._pages))

    # Counter shared with make_page through "nonlocal".
    finished_pages = 0
    yield from asyncio.gather(
        *[make_page(page, pdf_page, process_semaphore)
          for page, pdf_page in zip(self._pages, pdf_pages)])

    with TemporaryDirectory(prefix="djpdf-") as temp_dir:
        # Write with pdfrw first, then let the external command produce
        # the final file from that intermediate result.
        pdf_writer.write(path.join(temp_dir, "temp.pdf"))
        cmd = [QPDF_CMD,
               "--stream-data=preserve",
               "--object-streams=preserve",
               "--normalize-content=n"]
        if LINEARIZE_PDF:
            cmd.extend(["--linearize"])
        cmd.extend([path.abspath(path.join(temp_dir, "temp.pdf")),
                    path.abspath(outfile)])
        yield from run_command_async(cmd, process_semaphore)
def _build_font():
    """Assemble the Type0 font object used for the text layer.

    Reads FONT_FILENAME and UNICODE_CMAP_FILENAME, embeds both as
    Flate-compressed streams and returns the top-level font PdfDict
    (descendant CIDFontType2 font, descriptor, CID-to-GID map and
    ToUnicode map attached).
    """
    def flate_stream(raw, with_length1=True):
        # Indirect stream object holding *raw* compressed at level 9,
        # stored as a latin-1 decoded str.
        stream_obj = PdfDict()
        stream_obj.indirect = True
        stream_obj.Filter = [PdfName.FlateDecode]
        stream_obj.stream = zlib.compress(raw, 9).decode("latin-1")
        if with_length1:
            stream_obj.Length1 = len(raw)
        return stream_obj

    with open(FONT_FILENAME, "rb") as font_file:
        font_bytes = font_file.read()
    font_program = flate_stream(font_bytes)

    descriptor = PdfDict()
    descriptor.indirect = True
    descriptor.Ascent = 1000
    descriptor.CapHeight = 1000
    descriptor.Descent = -1
    descriptor.Flags = 5  # FixedPitch + Symbolic
    descriptor.FontBBox = PdfArray([0, 0, 1000, 500])
    descriptor.FontFile2 = font_program
    descriptor.FontName = PdfName.GlyphLessFont
    descriptor.ItalicAngle = 0
    descriptor.StemV = 80
    descriptor.Type = PdfName.FontDescriptor

    # Map everything to glyph 1
    gid_map = flate_stream(b"\0\1" * (1 << 16))

    system_info = PdfDict()
    system_info.Ordering = PdfString.from_unicode("Identity")
    system_info.Registry = PdfString.from_unicode("Adobe")
    system_info.Supplement = 0

    descendant = PdfDict()
    descendant.indirect = True
    descendant.CIDToGIDMap = gid_map
    descendant.BaseFont = PdfName.GlyphLessFont
    descendant.CIDSystemInfo = system_info
    descendant.FontDescriptor = descriptor
    descendant.Subtype = PdfName.CIDFontType2
    descendant.Type = PdfName.Font
    descendant.DW = 500

    with open(UNICODE_CMAP_FILENAME, "rb") as cmap_file:
        cmap_bytes = cmap_file.read()
    # The ToUnicode stream carries no Length1 entry.
    to_unicode = flate_stream(cmap_bytes, with_length1=False)

    type0_font = PdfDict()
    type0_font.indirect = True
    type0_font.BaseFont = PdfName.GlyphLessFont
    type0_font.DescendantFonts = PdfArray([descendant])
    type0_font.Encoding = PdfName("Identity-H")
    type0_font.Subtype = PdfName.Type0
    type0_font.ToUnicode = to_unicode
    type0_font.Type = PdfName.Font
    return type0_font
def _pdf_image(self, psem):
    """Encode this image (and any images sharing its symbols) as JBIG2.

    Generator-based coroutine.  Every image handled in this run gets a
    future registered through its _cache; each future is resolved with
    the finished image PdfDict.  Returns the PdfDict for this image.

    psem -- passed through to the external commands and mask coroutines.

    NOTE(review): the factory's _cache_lock is released in here, so it
    is presumably acquired by the caller -- confirm.
    """
    with TemporaryDirectory(prefix="djpdf-") as temp_dir:
        # JBIG2Globals are only used in symbol mode
        # In symbol mode jbig2 writes output to files otherwise
        # it's written to stdout
        symbol_mode = self.jbig2_threshold != 1
        images_with_shared_globals = []
        if symbol_mode and SHARE_JBIG2_GLOBALS:
            # Find all Jbig2Images that share the same symbol directory
            for obj in self._factory._cache:
                if (isinstance(obj, Jbig2Image) and
                        self.compression == obj.compression and
                        self.jbig2_threshold == obj.jbig2_threshold):
                    images_with_shared_globals.append(obj)
        else:
            # The symbol directory is not shared with other Jbig2Images
            images_with_shared_globals.append(self)
        # Promise all handled Jbig2Images the finished image
        image_futures = []
        my_image_future = None
        for image in images_with_shared_globals:
            future = asyncio.Future()
            asyncio.ensure_future(image._cache.get(future))
            image_futures.append(future)
            if image is self:
                my_image_future = future
        # All futures are in place, the lock can be released
        self._factory._cache_lock.release()
        self._cache_lock_acquired = False

        # Prepare everything in parallel
        @asyncio.coroutine
        def get_jbig2_images(psem):
            # Convert images with ImageMagick to bitonal png in parallel
            yield from asyncio.gather(*[
                run_command_async([
                    CONVERT_CMD,
                    "-alpha", "remove",
                    "-alpha", "off",
                    "-colorspace", "gray",
                    "-threshold", "50%",
                    path.abspath(image.filename),
                    path.abspath(path.join(temp_dir,
                                           "input.%d.png" % i))], psem)
                for i, image in enumerate(images_with_shared_globals)])
            cmd = [JBIG2_CMD, "-p"]
            if symbol_mode:
                cmd.extend(["-s", "-t",
                            format_number(self.jbig2_threshold, 4)])
            for i, _ in enumerate(images_with_shared_globals):
                cmd.append(path.abspath(path.join(temp_dir,
                                                  "input.%d.png" % i)))
            jbig2_images = []
            jbig2_globals = None
            if symbol_mode:
                yield from run_command_async(cmd, psem, cwd=temp_dir)
                # Shared symbols come from "output.sym", one
                # "output.NNNN" file holds each image.
                jbig2_globals = PdfDict()
                jbig2_globals.indirect = True
                with open(path.join(temp_dir, "output.sym"), "rb") as f:
                    jbig2_globals.stream = f.read().decode("latin-1")
                for i, _ in enumerate(images_with_shared_globals):
                    with open(path.join(temp_dir,
                                        "output.%04d" % i), "rb") as f:
                        jbig2_images.append(f.read())
            else:
                jbig2_images.append(
                    (yield from run_command_async(cmd, psem, cwd=temp_dir)))
            return jbig2_images, jbig2_globals

        @asyncio.coroutine
        def get_image_mask(image, psem):
            if image._mask is None:
                return None
            return (yield from image._mask.pdf_image(psem))

        ((jbig2_images, jbig2_globals), image_masks) = yield from asyncio.gather(
            get_jbig2_images(psem),
            asyncio.gather(*[get_image_mask(image, psem)
                             for image in images_with_shared_globals]))

        for image, jbig2_image, image_mask, image_future in zip(
                images_with_shared_globals, jbig2_images, image_masks,
                image_futures):
            # Four big-endian unsigned 32-bit integers are read from
            # bytes 11..26 of the encoder output; only width and height
            # are used below.
            (width, height, xres, yres) = struct.unpack(
                '>IIII', jbig2_image[11:27])
            pdf_image = PdfDict()
            pdf_image.indirect = True
            pdf_image.Type = PdfName.XObject
            pdf_image.Subtype = PdfName.Image
            pdf_image.Width = width
            pdf_image.Height = height
            if image._image_mask:
                pdf_image.ImageMask = PdfBool(True)
            else:
                pdf_image.ColorSpace = PdfName.DeviceGray
            if image_mask is not None:
                pdf_image.Mask = image_mask
            pdf_image.BitsPerComponent = 1
            pdf_image.Filter = [PdfName.JBIG2Decode]
            if symbol_mode:
                pdf_image.DecodeParms = [{
                    PdfName.JBIG2Globals: jbig2_globals}]
            pdf_image.stream = jbig2_image.decode("latin-1")
            # Resolve the promise made before the lock was released.
            image_future.set_result(pdf_image)
        return my_image_future.result()
def _get_fullpage(contents, resources, mbox, bbox, rotation):
    ''' Full-page case: copy the content dictionary and attach the
        resources.  mbox, bbox and rotation are accepted for signature
        compatibility with the sub-page builder but are not used here;
        _cache_xobj handles the rest.
    '''
    page_copy = PdfDict(contents, Resources=resources)
    return page_copy
def make_page(page, pdf_page, psem):
    """Fill *pdf_page* with the PDF objects that render *page*.

    Generator-based coroutine.  Besides its arguments it reads self,
    pdf_group, default_rgb_colorspace, pdf_font_mapping, pdf_pages and
    progress_cb from the enclosing scope and increments the enclosing
    finished_pages counter.

    page     -- source page (thumbnail, background, foreground, color,
                text, width, height are read).
    pdf_page -- page dictionary that receives MediaBox, Group, Thumb,
                Contents, Annots and Resources.
    psem     -- passed through to every image/thumbnail/mask coroutine.
    """
    # Prepare everything in parallel
    @asyncio.coroutine
    def get_pdf_thumbnail(psem):
        if page.thumbnail is None:
            return None
        return (yield from page.thumbnail.pdf_thumbnail(psem))

    @asyncio.coroutine
    def get_pdf_background(psem):
        if page.background is None:
            return None
        return (yield from page.background.pdf_image(psem))

    @asyncio.coroutine
    def get_pdf_mask(foreground, psem):
        # Only foregrounds without a color get a mask object.
        if foreground.color is not None:
            return None
        return (yield from foreground.pdf_mask(psem))

    pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
        yield from asyncio.gather(
            get_pdf_thumbnail(psem), get_pdf_background(psem),
            asyncio.gather(
                *[fg.pdf_image(psem) for fg in page.foreground]),
            asyncio.gather(
                *[get_pdf_mask(fg, psem) for fg in page.foreground])))
    pdf_page.MediaBox = PdfArray(
        [0, 0, PdfNumber(page.width),
         PdfNumber(page.height)])
    pdf_page.Group = pdf_group
    pdf_resources = PdfDict()
    # Every page gets a ColorSpace resource with a DefaultRGB entry, so
    # pdf_resources is never empty in this variant.
    pdf_colorspace = PdfDict()
    pdf_colorspace.DefaultRGB = default_rgb_colorspace
    pdf_resources.ColorSpace = pdf_colorspace
    pdf_xobject = PdfDict()
    if pdf_thumbnail is not None:
        pdf_page.Thumb = pdf_thumbnail
    # Running number used for the /Im<n> XObject names of this page.
    im_index = 0
    # Save graphics state and scale unity rectangle to page size
    matrix = TransformationMatrix()
    matrix.scale(page.width, page.height)
    before_graphics = ("q\n" + "%s cm\n" % matrix.to_pdf())
    after_graphics = "\nQ\n"
    contents = ""
    graphics = ""
    # Last fill color written to the stream; used to skip repeated "rg".
    current_color = None
    if page.color != self._factory.WHITE:
        # Non-white page color: fill the unit rectangle with it.
        if current_color != page.color:
            current_color = page.color
            graphics += page.color.to_pdf() + " rg "
        graphics += ("0 0 1 1 re " + "f\n")
    if pdf_background is not None:
        pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
        graphics += "/Im%d Do\n" % im_index
        im_index += 1
    for foreground, pdf_foreground, pdf_mask in zip(
            page.foreground, pdf_foregrounds, pdf_masks):
        if pdf_mask is not None:
            # The mask is registered under its own name but is not
            # drawn with "Do".
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
            im_index += 1
        pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
        if (foreground.color is not None and current_color
                != foreground.color):
            current_color = foreground.color
            graphics += foreground.color.to_pdf() + " rg "
        graphics += "/Im%d Do\n" % im_index
        im_index += 1
    if graphics:
        contents += (before_graphics + graphics.rstrip(" \n") +
                     after_graphics)
    current_color = None
    # Text is set in font F1 with text rendering mode 3.
    before_text = ("BT\n" + "/F1 1 Tf 3 Tr\n")
    after_text = "\nET\n"
    text = ""
    pdf_annots = []
    for t in page.text:
        if t.text:
            matrix = TransformationMatrix()
            # Glyph size is 0.5 x 1
            matrix.scale(2 / len(t.text), 1)
            matrix.translate(-0.5, -0.5)
            if t.direction == "ltr":
                pass
            elif t.direction == "rtl":
                matrix.translate(0, -1)
            elif t.direction == "ttb":
                matrix.rotate(90)
            matrix.rotate(-t.rotation)
            matrix.translate(0.5, 0.5)
            matrix.scale(t.width, t.height)
            matrix.translate(t.x, t.y)
            # The string is written as hex-encoded UTF-16-BE bytes.
            text += "%s Tm %s Tj\n" % (
                matrix.to_pdf(), PdfString().from_bytes(
                    t.text.encode("utf-16-be"), bytes_encoding="hex"))
        if t.external_link is not None or t.internal_link is not None:
            # One Link annotation covering the text rectangle.
            pdf_annot = PdfDict()
            pdf_annots.append(pdf_annot)
            pdf_annot.Type = PdfName.Annot
            pdf_annot.Subtype = PdfName.Link
            pdf_annot.Border = [0, 0, 0]
            pdf_annot.Rect = [
                PdfNumber(t.x), PdfNumber(t.y),
                PdfNumber(t.x + t.width), PdfNumber(t.y + t.height)
            ]
            if t.external_link is not None:
                pdf_a = PdfDict()
                pdf_annot.A = pdf_a
                pdf_a.Type = PdfName.Action
                pdf_a.S = PdfName.URI
                pdf_a.URI = t.external_link.decode("latin-1")
            if t.internal_link is not None:
                # internal_link is (page index, (x, y)).
                pdf_target_page = pdf_pages[t.internal_link[0]]
                target_x, target_y = t.internal_link[1]
                pdf_annot.Dest = [
                    pdf_target_page, PdfName.XYZ, PdfNumber(target_x),
                    PdfNumber(target_y), 0
                ]
    text = text.rstrip(" \n")
    if text:
        pdf_resources.Font = pdf_font_mapping
        contents += (before_text + text + after_text)
    contents = contents.rstrip(" \n")
    if contents:
        pdf_contents = PdfDict()
        pdf_contents.indirect = True
        pdf_page.Contents = pdf_contents
        if COMPRESS_PAGE_CONTENTS:
            # The compressed bytes are stored as a latin-1 decoded str.
            pdf_contents.Filter = [PdfName.FlateDecode]
            pdf_contents.stream = zlib.compress(
                contents.encode("latin-1"), 9).decode("latin-1")
        else:
            pdf_contents.stream = contents
    if pdf_annots:
        pdf_page.Annots = pdf_annots
    if pdf_xobject:
        pdf_resources.XObject = pdf_xobject
    if pdf_resources:
        pdf_page.Resources = pdf_resources
    # Report progress
    nonlocal finished_pages
    finished_pages += 1
    if progress_cb:
        # The callback receives the finished fraction of all pages.
        progress_cb(finished_pages / len(self._pages))
def write_async(self, outfile, process_semaphore, progress_cb=None):
    """Build the PDF for all pages and write it to *outfile*.

    Generator-based coroutine.  All pages are prepared concurrently,
    document-level entries (ID, MarkInfo, StructTreeRoot, XMP Metadata)
    are added to the trailer, the document is written to a temporary
    file with PdfWriter, and QPDF_CMD is then run to produce *outfile*.

    outfile           -- path of the final PDF.
    process_semaphore -- passed to every page coroutine and to
                         run_command_async.
    progress_cb       -- optional callable; called after each finished
                         page with the finished fraction of all pages.
    """
    pdf_writer = PdfWriter(version="1.5")
    # Transparency group dictionary shared by every page.
    pdf_group = PdfDict()
    pdf_group.indirect = True
    pdf_group.CS = PdfName.DeviceRGB
    pdf_group.I = PdfBool(True)
    pdf_group.S = PdfName.Transparency
    # Font resource dictionary shared by every page that has text.
    pdf_font_mapping = PdfDict()
    pdf_font_mapping.indirect = True
    pdf_font_mapping.F1 = self._build_font()

    for _ in self._pages:
        pdf_page = PdfDict()
        pdf_page.Type = PdfName.Page
        pdf_writer.addpage(pdf_page)
    # pdfrw makes an internal copy of the pages
    # use the copy so that references to pages in links are correct
    pdf_pages = list(pdf_writer.pagearray)

    # ICC profile stream read from SRGB_ICC_FILENAME, wrapped in an
    # ICCBased color space array that every page uses as DefaultRGB.
    srgb_colorspace = PdfDict()
    srgb_colorspace.indirect = True
    srgb_colorspace.N = 3  # Number of components (red, green, blue)
    with open(SRGB_ICC_FILENAME, "rb") as f:
        srgb_colorspace_stream = f.read()
    srgb_colorspace.Filter = [PdfName.FlateDecode]
    srgb_colorspace.stream = zlib.compress(srgb_colorspace_stream,
                                           9).decode("latin-1")
    srgb_colorspace.Length1 = len(srgb_colorspace_stream)
    default_rgb_colorspace = PdfArray([PdfName.ICCBased, srgb_colorspace])
    default_rgb_colorspace.indirect = True

    # Handle all pages in parallel
    @asyncio.coroutine
    def make_page(page, pdf_page, psem):
        # Prepare everything in parallel
        @asyncio.coroutine
        def get_pdf_thumbnail(psem):
            if page.thumbnail is None:
                return None
            return (yield from page.thumbnail.pdf_thumbnail(psem))

        @asyncio.coroutine
        def get_pdf_background(psem):
            if page.background is None:
                return None
            return (yield from page.background.pdf_image(psem))

        @asyncio.coroutine
        def get_pdf_mask(foreground, psem):
            # Only foregrounds without a color get a mask object.
            if foreground.color is not None:
                return None
            return (yield from foreground.pdf_mask(psem))

        pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
            yield from asyncio.gather(
                get_pdf_thumbnail(psem), get_pdf_background(psem),
                asyncio.gather(
                    *[fg.pdf_image(psem) for fg in page.foreground]),
                asyncio.gather(
                    *[get_pdf_mask(fg, psem) for fg in page.foreground])))
        pdf_page.MediaBox = PdfArray(
            [0, 0, PdfNumber(page.width),
             PdfNumber(page.height)])
        pdf_page.Group = pdf_group
        pdf_resources = PdfDict()
        # Every page gets a ColorSpace resource with a DefaultRGB entry.
        pdf_colorspace = PdfDict()
        pdf_colorspace.DefaultRGB = default_rgb_colorspace
        pdf_resources.ColorSpace = pdf_colorspace
        pdf_xobject = PdfDict()
        if pdf_thumbnail is not None:
            pdf_page.Thumb = pdf_thumbnail
        # Running number used for the /Im<n> XObject names of this page.
        im_index = 0
        # Save graphics state and scale unity rectangle to page size
        matrix = TransformationMatrix()
        matrix.scale(page.width, page.height)
        before_graphics = ("q\n" + "%s cm\n" % matrix.to_pdf())
        after_graphics = "\nQ\n"
        contents = ""
        graphics = ""
        # Last fill color written; used to skip repeated "rg" operators.
        current_color = None
        if page.color != self._factory.WHITE:
            # Non-white page color: fill the unit rectangle with it.
            if current_color != page.color:
                current_color = page.color
                graphics += page.color.to_pdf() + " rg "
            graphics += ("0 0 1 1 re " + "f\n")
        if pdf_background is not None:
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        for foreground, pdf_foreground, pdf_mask in zip(
                page.foreground, pdf_foregrounds, pdf_masks):
            if pdf_mask is not None:
                # The mask is registered under its own name but is not
                # drawn with "Do".
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
                im_index += 1
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
            if (foreground.color is not None
                    and current_color != foreground.color):
                current_color = foreground.color
                graphics += foreground.color.to_pdf() + " rg "
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        if graphics:
            contents += (before_graphics + graphics.rstrip(" \n") +
                         after_graphics)
        current_color = None
        # Text is set in font F1 with text rendering mode 3.
        before_text = ("BT\n" + "/F1 1 Tf 3 Tr\n")
        after_text = "\nET\n"
        text = ""
        pdf_annots = []
        for t in page.text:
            if t.text:
                matrix = TransformationMatrix()
                # Glyph size is 0.5 x 1
                matrix.scale(2 / len(t.text), 1)
                matrix.translate(-0.5, -0.5)
                if t.direction == "ltr":
                    pass
                elif t.direction == "rtl":
                    matrix.translate(0, -1)
                elif t.direction == "ttb":
                    matrix.rotate(90)
                matrix.rotate(-t.rotation)
                matrix.translate(0.5, 0.5)
                matrix.scale(t.width, t.height)
                matrix.translate(t.x, t.y)
                # The string is written as hex-encoded UTF-16-BE bytes.
                text += "%s Tm %s Tj\n" % (
                    matrix.to_pdf(), PdfString().from_bytes(
                        t.text.encode("utf-16-be"),
                        bytes_encoding="hex"))
            if t.external_link is not None or t.internal_link is not None:
                # One Link annotation covering the text rectangle.
                pdf_annot = PdfDict()
                pdf_annots.append(pdf_annot)
                pdf_annot.Type = PdfName.Annot
                pdf_annot.Subtype = PdfName.Link
                pdf_annot.Border = [0, 0, 0]
                pdf_annot.Rect = [
                    PdfNumber(t.x), PdfNumber(t.y),
                    PdfNumber(t.x + t.width), PdfNumber(t.y + t.height)
                ]
                if t.external_link is not None:
                    pdf_a = PdfDict()
                    pdf_annot.A = pdf_a
                    pdf_a.Type = PdfName.Action
                    pdf_a.S = PdfName.URI
                    pdf_a.URI = t.external_link.decode("latin-1")
                if t.internal_link is not None:
                    # internal_link is (page index, (x, y)).
                    pdf_target_page = pdf_pages[t.internal_link[0]]
                    target_x, target_y = t.internal_link[1]
                    pdf_annot.Dest = [
                        pdf_target_page, PdfName.XYZ, PdfNumber(target_x),
                        PdfNumber(target_y), 0
                    ]
        text = text.rstrip(" \n")
        if text:
            pdf_resources.Font = pdf_font_mapping
            contents += (before_text + text + after_text)
        contents = contents.rstrip(" \n")
        if contents:
            pdf_contents = PdfDict()
            pdf_contents.indirect = True
            pdf_page.Contents = pdf_contents
            if COMPRESS_PAGE_CONTENTS:
                # The compressed bytes are stored as a latin-1 decoded str.
                pdf_contents.Filter = [PdfName.FlateDecode]
                pdf_contents.stream = zlib.compress(
                    contents.encode("latin-1"), 9).decode("latin-1")
            else:
                pdf_contents.stream = contents
        if pdf_annots:
            pdf_page.Annots = pdf_annots
        if pdf_xobject:
            pdf_resources.XObject = pdf_xobject
        if pdf_resources:
            pdf_page.Resources = pdf_resources
        # Report progress
        nonlocal finished_pages
        finished_pages += 1
        if progress_cb:
            progress_cb(finished_pages / len(self._pages))

    # Counter shared with make_page through "nonlocal".
    finished_pages = 0
    yield from asyncio.gather(*[
        make_page(page, pdf_page, process_semaphore)
        for page, pdf_page in zip(self._pages, pdf_pages)
    ])

    trailer = pdf_writer.trailer
    # Both ID entries are the same 16 random bytes.
    document_id = PdfString().from_bytes(os.urandom(16))
    trailer.ID = [document_id, document_id]
    mark_info = PdfDict()
    mark_info.Marked = PdfBool(True)
    trailer.Root.MarkInfo = mark_info
    # The structure tree root carries only its Type entry.
    struct_tree_root = PdfDict()
    struct_tree_root.Type = PdfName.StructTreeRoot
    trailer.Root.StructTreeRoot = struct_tree_root
    metadata = PdfDict()
    metadata.indirect = True
    metadata.Type = PdfName.Metadata
    metadata.Subtype = PdfName.XML
    # XMP packet with the PDF/A identification properties
    # part "2" and conformance "A".
    xmp = XMPMeta()
    xmp.set_property(XMP_NS_PDFA_ID, "part", "2")
    xmp.set_property(XMP_NS_PDFA_ID, "conformance", "A")
    metadata_stream = xmp.serialize_to_str().encode("utf-8")
    metadata.Filter = [PdfName.FlateDecode]
    metadata.stream = zlib.compress(metadata_stream, 9).decode("latin-1")
    metadata.Length1 = len(metadata_stream)
    trailer.Root.Metadata = metadata

    with TemporaryDirectory(prefix="djpdf-") as temp_dir:
        # Write with pdfrw first, then let the external command produce
        # the final file from that intermediate result.
        pdf_writer.write(path.join(temp_dir, "temp.pdf"))
        cmd = [
            QPDF_CMD, "--stream-data=preserve",
            "--object-streams=preserve", "--normalize-content=n",
            "--newline-before-endstream"
        ]
        if LINEARIZE_PDF:
            cmd.extend(["--linearize"])
        cmd.extend([
            path.abspath(path.join(temp_dir, "temp.pdf")),
            path.abspath(outfile)
        ])
        yield from run_command_async(cmd, process_semaphore)
def _build_font():
    """Build and return the Type0 font dictionary for "GlyphLessFont".

    The font program is read from FONT_FILENAME and attached to the font
    descriptor as FontFile2; the ToUnicode CMap is read from
    UNICODE_CMAP_FILENAME.  Every stream is Flate-compressed at level 9 and
    stored as a latin-1 decoded string.

    Returns:
        The indirect PdfDict of the composite (Type0) font, whose single
        descendant is a CIDFontType2 font mapping every CID to glyph 1.
    """

    def flate_stream(raw, record_length1=True):
        # Wrap the bytes *raw* in an indirect, Flate-compressed stream
        # object; Length1 (the uncompressed size) is recorded on request.
        obj = PdfDict()
        obj.indirect = True
        obj.Filter = [PdfName.FlateDecode]
        obj.stream = zlib.compress(raw, 9).decode("latin-1")
        if record_length1:
            obj.Length1 = len(raw)
        return obj

    with open(FONT_FILENAME, "rb") as font_file:
        font_program = flate_stream(font_file.read())

    descriptor = PdfDict(
        indirect=True,
        Ascent=1000,
        CapHeight=1000,
        Descent=-1,
        Flags=5,  # FixedPitch + Symbolic
        FontBBox=PdfArray([0, 0, 1000, 500]),
        FontFile2=font_program,
        FontName=PdfName.GlyphLessFont,
        ItalicAngle=0,
        StemV=80,
        Type=PdfName.FontDescriptor,
    )

    # Map everything to glyph 1
    gid_map = flate_stream(b"\0\1" * (1 << 16))

    system_info = PdfDict(
        Ordering=PdfString.from_unicode("Identity"),
        Registry=PdfString.from_unicode("Adobe"),
        Supplement=0,
    )

    descendant = PdfDict(
        indirect=True,
        CIDToGIDMap=gid_map,
        BaseFont=PdfName.GlyphLessFont,
        CIDSystemInfo=system_info,
        FontDescriptor=descriptor,
        Subtype=PdfName.CIDFontType2,
        Type=PdfName.Font,
        DW=500,
    )

    with open(UNICODE_CMAP_FILENAME, "rb") as cmap_file:
        # The CMap stream carries no Length1 entry.
        to_unicode = flate_stream(cmap_file.read(), record_length1=False)

    return PdfDict(
        indirect=True,
        BaseFont=PdfName.GlyphLessFont,
        DescendantFonts=PdfArray([descendant]),
        Encoding=PdfName("Identity-H"),
        Subtype=PdfName.Type0,
        ToUnicode=to_unicode,
        Type=PdfName.Font,
    )
def _pdf_image(self, psem):
    """Encode this image with jbig2 and return its PDF image XObject.

    Generator-based coroutine (uses ``yield from``).  When symbol mode is
    active and SHARE_JBIG2_GLOBALS is set, every Jbig2Image in the factory
    cache with the same compression and threshold is encoded in one jbig2
    run, so they share a single JBIG2Globals stream; otherwise only this
    image is processed.

    Each processed image gets a future that is handed to its ``_cache.get``
    and resolved with the finished PdfDict.  The factory cache lock is
    released once all futures are registered (presumably the caller
    acquired it -- confirm against the call site).

    Args:
        psem: Passed through to ``run_command_async`` and to the mask
            images' ``pdf_image``.

    Returns:
        The PdfDict stored in the future that belongs to ``self``.
    """
    with TemporaryDirectory(prefix="djpdf-") as workdir:

        def png_path(index):
            # Absolute path of the bitonal PNG for group member *index*.
            return path.abspath(path.join(workdir, "input.%d.png" % index))

        # JBIG2Globals only exist in symbol mode.  In symbol mode jbig2
        # writes its output to files, otherwise it writes to stdout.
        symbol_mode = self.jbig2_threshold != 1
        if symbol_mode and SHARE_JBIG2_GLOBALS:
            # Collect every Jbig2Image using the same symbol directory
            group = [
                candidate for candidate in self._factory._cache
                if (isinstance(candidate, Jbig2Image)
                    and self.compression == candidate.compression
                    and self.jbig2_threshold == candidate.jbig2_threshold)
            ]
        else:
            # Nothing is shared: handle this image on its own
            group = [self]

        # Hand each group member's cache a future for its finished image
        pending = []
        own_future = None
        for member in group:
            promise = asyncio.Future()
            asyncio.ensure_future(member._cache.get(promise))
            pending.append(promise)
            if member is self:
                own_future = promise

        # Every future is registered, so the lock is no longer needed
        self._factory._cache_lock.release()
        self._cache_lock_acquired = False

        @asyncio.coroutine
        def encode_group(psem):
            # Threshold all inputs to bitonal PNG with ImageMagick, in
            # parallel
            yield from asyncio.gather(*[
                run_command_async([
                    CONVERT_CMD,
                    "-alpha", "remove",
                    "-alpha", "off",
                    "-colorspace", "gray",
                    "-threshold", "50%",
                    path.abspath(member.filename),
                    png_path(index)
                ], psem)
                for index, member in enumerate(group)
            ])

            jbig2_cmd = [JBIG2_CMD, "-p"]
            if symbol_mode:
                jbig2_cmd.extend(
                    ["-s", "-t", format_number(self.jbig2_threshold, 4)])
            jbig2_cmd.extend(png_path(index) for index in range(len(group)))

            if not symbol_mode:
                # Generic mode: the encoded image is the command's output
                stdout_data = yield from run_command_async(
                    jbig2_cmd, psem, cwd=workdir)
                return [stdout_data], None

            # Symbol mode: results are left in files inside workdir
            yield from run_command_async(jbig2_cmd, psem, cwd=workdir)
            shared_globals = PdfDict()
            shared_globals.indirect = True
            with open(path.join(workdir, "output.sym"), "rb") as f:
                shared_globals.stream = f.read().decode("latin-1")
            encoded = []
            for index in range(len(group)):
                with open(path.join(workdir, "output.%04d" % index),
                          "rb") as f:
                    encoded.append(f.read())
            return encoded, shared_globals

        @asyncio.coroutine
        def resolve_mask(member, psem):
            if member._mask is not None:
                return (yield from member._mask.pdf_image(psem))
            return None

        # Run the encoding and all mask conversions concurrently
        encode_result, masks = yield from asyncio.gather(
            encode_group(psem),
            asyncio.gather(*[
                resolve_mask(member, psem) for member in group
            ]))
        encoded_images, shared_globals = encode_result

        for member, data, mask, promise in zip(
                group, encoded_images, masks, pending):
            # Four big-endian 32-bit integers at bytes 11..26 of the
            # encoded data; only width and height are used here.
            width, height, _xres, _yres = struct.unpack(
                '>IIII', data[11:27])
            xobj = PdfDict()
            xobj.indirect = True
            xobj.Type = PdfName.XObject
            xobj.Subtype = PdfName.Image
            xobj.Width = width
            xobj.Height = height
            if member._image_mask:
                xobj.ImageMask = PdfBool(True)
            else:
                # NOTE: DefaultGray color space is required for PDF/A
                xobj.ColorSpace = PdfName.DeviceGray
            if mask is not None:
                xobj.Mask = mask
            xobj.BitsPerComponent = 1
            xobj.Filter = [PdfName.JBIG2Decode]
            if symbol_mode:
                xobj.DecodeParms = [{
                    PdfName.JBIG2Globals: shared_globals
                }]
            xobj.stream = data.decode("latin-1")
            promise.set_result(xobj)

        return own_future.result()