def wrap_object(obj, width, margin):
    ''' Build a page dictionary whose only content is the given XObject.

        obj    -- a Form or Image XObject (selected by its Subtype)
        width  -- page width used when laying out an Image
        margin -- indexable with four entries; [0] and [1] are used as
                  the x and y offset of the image, [2] is subtracted
                  from the width and [3] is added to the page height

        A Form has its stream, Length, Filter and DecodeParms copied to
        the page contents, and supplies the page Resources and the
        MediaBox (from its BBox).  An Image is scaled to the width left
        between margin[0] and margin[2], keeping its aspect ratio, and
        is drawn by a generated content stream.

        Raises TypeError for any other Subtype.
    '''
    def trimmed(number):
        # Fixed-point text without trailing zeros or a dangling '.'
        return ('%.9f' % number).rstrip('0').rstrip('.')

    page_contents = PdfDict(indirect=True)
    kind = obj.Subtype

    if kind == PdfName.Form:
        # The private _stream attribute is assigned (not .stream) and the
        # Length/Filter/DecodeParms entries are copied next to it.
        # NOTE(review): presumably this bypasses side effects of the
        # public stream setter -- confirm against PdfDict.
        page_contents._stream = obj.stream
        page_contents.Length = obj.Length
        page_contents.Filter = obj.Filter
        page_contents.DecodeParms = obj.DecodeParms
        page_resources = obj.Resources
        media_box = obj.BBox
    elif kind == PdfName.Image:
        x_off, y_off = margin[0], margin[1]
        draw_w = width - x_off - margin[2]
        img_w = float(obj.Width)
        img_h = float(obj.Height)
        # Height follows from the drawn width and the image aspect ratio
        draw_h = 1.0 * draw_w / img_w * img_h
        page_h = draw_h + y_off + margin[3]
        operands = tuple(
            trimmed(v) for v in (draw_w, draw_h, x_off, y_off))
        page_contents.stream = (
            'q %s 0 0 %s %s %s cm /MyImage Do Q' % operands)
        page_resources = PdfDict(XObject=PdfDict(MyImage=obj))
        media_box = PdfArray((0, 0, width, page_h))
    else:
        raise TypeError("Expected Form or Image XObject")

    return PdfDict(
        indirect=True,
        Type=PdfName.Page,
        MediaBox=media_box,
        Resources=page_resources,
        Contents=page_contents,
    )
def _get_subpage(contents, resources, mbox, bbox, rotation):
    ''' Return the dictionary for a Form XObject showing part of a page.

        Rather than copying the page contents, the whole page is first
        obtained as its own (cached) Form XObject, and the returned
        dictionary merely draws that XObject under the name /FullPage.
        This lets several views of one page share a single copy of the
        page contents.

        bbox and rotation are accepted for signature compatibility with
        _get_fullpage but are not used here.
    '''
    # Full-page form: same rectangle for media box and bounding box,
    # and no rotation.
    whole_page = _cache_xobj(contents, resources, mbox, mbox, 0)
    page_resources = PdfDict(XObject=PdfDict(FullPage=whole_page))
    return PdfDict(stream='/FullPage Do\n', Resources=page_resources)
def readdict(self, source, PdfDict=PdfDict):
    ''' Parse the body of a dictionary; the opening << was already read.

        Tokens are pulled from source until the closing >> is seen.
        This variant is lenient: a token that should be a /name but is
        not is reported through source.warning() and skipped, and a key
        whose value turns out to be a "bad token" is dropped while
        parsing carries on from that token.

        Returns the populated PdfDict.
    '''
    lookup_special = self.special.get
    entries = PdfDict()
    fetch = source.next

    token = fetch()
    while token != '>>':
        if not token.startswith('/'):
            # Not a /name: complain, then move on to the next token.
            source.warning('Expected PDF /name object')
            token = fetch()
            continue

        name, value = token, fetch()
        handler = lookup_special(value)
        if handler is None:
            # Plain token.  Two integers in a row must be followed by
            # 'R' and denote an indirect reference.
            token = fetch()
            if value.isdigit() and token.isdigit():
                if fetch() != 'R':
                    source.exception('Expected "R" following two integers')
                value = self.findindirect(value, token)
                token = fetch()
        elif handler == self.badtoken:
            # Drop this key and re-examine the offending token as the
            # start of the next entry (or as the closing >>).
            token = value
            continue
        else:
            # Nested structure: let its handler consume its tokens.
            value = handler(source)
            token = fetch()
        entries[name] = value
    return entries
def as_pdf_object(self, transform, page):
    """Build the annotation dictionary that gets inserted into the PDF.

    The annotation rectangle is transformed with ``transform``, an
    appearance stream dictionary is generated for it, and every related
    annotation in ``self._related`` is converted recursively and attached
    under its own name. The result is marked as an indirect object.

    :param list transform: Transformation matrix taking annotation
        coordinates from client-specified space to PDF user space.
    :param PdfDict page: The pdfrw page object the annotation belongs to;
        stored as the annotation's P entry when ``_page_as_parent`` is set.
    :returns PdfDict: the annotation object to be inserted into the PDF
    """
    rect = transform_rect(self.make_rect(), transform)
    appearance = self._make_appearance_stream_dict(rect, transform)

    annotation = PdfDict(
        Type=PdfName('Annot'),
        Subtype=PdfName(self.subtype),
        Rect=rect,
        AP=appearance,
    )

    if self._page_as_parent:
        annotation.P = page

    # Related annotations become entries named after their key.
    for key in self._related:
        related = self._related[key]
        setattr(annotation, key, related.as_pdf_object(transform, page))

    self._add_metadata(annotation, self._metadata)
    self.add_additional_pdf_object_data(annotation)

    annotation.indirect = True
    return annotation
def _cache_xobj(contents, resources, mbox, bbox, rotation):
    ''' Look up, or build and remember, the Form XObject for a view.

        The cache lives in the private xobj_cachedict attribute of
        contents and is keyed on (mbox, bbox, rotation).

        A newly built XObject gets the private members x, y, w and h,
        taken from bbox (after rotating it when rotation is non-zero).
    '''
    cache = contents.xobj_cachedict
    if cache is None:
        cache = contents.private.xobj_cachedict = {}

    key = mbox, bbox, rotation
    cached = cache.get(key)
    if cached is not None:
        return cached

    # A view covering the full media box copies the page directly;
    # anything else goes through the sub-page wrapper.
    builder = _get_subpage if mbox != bbox else _get_fullpage
    body = builder(contents, resources, mbox, bbox, rotation)
    xobj = PdfDict(
        body,
        Type=PdfName.XObject,
        Subtype=PdfName.Form,
        FormType=1,
        BBox=PdfArray(bbox),
    )

    rect = bbox
    if rotation:
        # Rotated unit vectors form the linear part of the matrix;
        # the translation part is left at (0, 0).
        x_axis = rotate_point((1, 0), rotation)
        y_axis = rotate_point((0, 1), rotation)
        xobj.Matrix = PdfArray(x_axis + y_axis + (0, 0))
        rect = rotate_rect(rect, rotation)

    xobj.private.x = rect[0]
    xobj.private.y = rect[1]
    xobj.private.w = rect[2] - rect[0]
    xobj.private.h = rect[3] - rect[1]

    cache[key] = xobj
    return xobj
def readdict(self, source, PdfDict=PdfDict):
    ''' Parse the body of a dictionary; the opening << was already read.

        Tokens are pulled from source until the closing >> is seen.
        A key that is not a /name, or two integers not followed by R,
        is reported through source.exception().

        Returns the populated PdfDict.
    '''
    lookup_special = self.special.get
    entries = PdfDict()
    fetch = source.next

    token = fetch()
    while token != '>>':
        if not token.startswith('/'):
            source.exception('Expected PDF /name object')

        name, value = token, fetch()
        handler = lookup_special(value)
        if handler is None:
            # Plain token.  Two integers in a row must be followed by
            # 'R' and denote an indirect reference.
            token = fetch()
            if value.isdigit() and token.isdigit():
                if fetch() != 'R':
                    source.exception('Expected "R" following two integers')
                value = self.findindirect(value, token)
                token = fetch()
        else:
            # Special token: let its handler consume what follows.
            value = handler(source)
            token = fetch()
        entries[name] = value
    return entries
def _add_graphics_state_resources(resources, A):
    """Add the ExtGState entry to an appearance stream's Resources dict.

    Two sources of graphics states are combined, in this order:

    * the internal state returned by
      ``Annotation._get_internal_graphics_state`` (if it is not None),
      stored under ``GRAPHICS_STATE_NAME``;
    * the externally specified states in ``A.graphics_states``, each
      stored under its own name after conversion with ``as_pdf_dict()``.
      These are meant for use in explicit content streams.

    When neither source yields a state, ``resources`` is left untouched.
    Otherwise ``resources.ExtGState`` is replaced by a new dictionary
    mapping each name to its state.
    """
    named_states = []

    builtin = Annotation._get_internal_graphics_state(resources, A)
    if builtin is not None:
        named_states.append((GRAPHICS_STATE_NAME, builtin))

    if A.graphics_states:
        named_states.extend(
            (label, gs.as_pdf_dict())
            for label, gs in A.graphics_states.items()
        )

    if not named_states:
        return

    resources.ExtGState = PdfDict()
    for label, gs in named_states:
        resources.ExtGState[PdfName(label)] = gs
def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' Create and return a Form XObject for a view within a page.

        viewinfo describes the view (including any extra rotation)
        and is passed on to getrects().

        A page whose Contents is an array of several streams has
        those streams decompressed and joined with newlines into one
        new stream.  The resulting contents must have a Length that
        matches its stream; with allow_compressed false it must also
        carry no entry other than that one (both checked by assert).
    '''
    inherited = page.inheritable
    resources = inherited.Resources
    rotation = get_rotation(inherited.Rotate)
    mbox, bbox = getrects(inherited, viewinfo, rotation)
    rotation += get_rotation(viewinfo.rotate)

    raw = page.Contents
    if not isinstance(raw, PdfArray):
        contents = raw
    elif len(raw) == 1:
        contents = raw[0]
    else:
        # Several streams: decompress them and glue them together
        parts = [part for part in raw]
        uncompress(parts)
        joined = '\n'.join([part.stream for part in parts])
        contents = PdfDict(stream=joined)

    # The declared length has to match the data, i.e. every filter
    # must already have been applied.
    assert int(contents.Length) == len(contents.stream)
    if not allow_compressed:
        # Length must be the only entry in the dictionary
        assert len([entry for entry in contents.iteritems()]) == 1

    return _cache_xobj(contents, resources, mbox, bbox, rotation)
def uncompress(mylist, leave_raw=False, warnings=set(),
               flate=PdfName.FlateDecode, decompress=decompressobj,
               isinstance=isinstance, list=list, len=len):
    ''' Decompress, in place, the stream objects found in mylist.

        Only the filter given by flate is handled; objects using any
        other filter are left alone and reported once per distinct
        message (the default warnings set is shared between calls and
        remembers what was already logged).

        After inflating, predictor parameters from DecodeParms (or DP)
        are honoured: predictors 10..15 are undone with flate_png(),
        predictor 1 needs no work, anything else is an error.

        On success the object's Filter is cleared and its stream is
        replaced by the decoded data (passed through convert_load()
        unless leave_raw is true).

        Returns True only if every stream object was either unfiltered
        or successfully decompressed.
    '''
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            # Nothing to undo for this object
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms or obj.DP
        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            # Log each distinct message only once
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            try:
                data = dco.decompress(convert_store(obj.stream))
            except Exception as s:
                error = str(s)
            else:
                error = None
                # An array of parameter dictionaries is merged into one
                if isinstance(parms, PdfArray):
                    oldparms = parms
                    parms = PdfDict()
                    for x in oldparms:
                        parms.update(x)
                if parms:
                    predictor = int(parms.Predictor or 1)
                    columns = int(parms.Columns or 1)
                    colors = int(parms.Colors or 1)
                    bpc = int(parms.BitsPerComponent or 8)
                    if 10 <= predictor <= 15:
                        data, error = flate_png(data, predictor, columns,
                                                colors, bpc)
                    elif predictor != 1:
                        error = ('Unsupported flatedecode predictor %s' %
                                 repr(predictor))
            if error is None:
                assert not dco.unconsumed_tail
                # Non-blank bytes after the compressed data are an error
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data if leave_raw else convert_load(data)
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False
    return ok
def _add_xobject_resources(resources, A):
    """Register explicitly provided XObjects in the Resources dict.

    Used when the caller supplies their own appearance stream and wants
    it to reference, say, an image. When ``A.xobjects`` is empty or unset,
    ``resources`` is left untouched; otherwise ``resources.XObject`` is
    replaced by a new dictionary mapping each name to its XObject.
    """
    if not A.xobjects:
        return

    resources.XObject = PdfDict()
    for label, xobj in A.xobjects.items():
        resources.XObject[PdfName(label)] = xobj
def _make_border_dict(width, style, dash_array=None):
    """Build a border style dictionary.

    :param width: value stored as the W entry
    :param str style: style code stored (as a name) in the S entry
    :param dash_array: optional value stored as the D entry; only accepted
        together with style ``'D'``
    :returns PdfDict: the border dictionary
    :raises ValueError: if a dash array is given for any other style
    """
    border = PdfDict(Type=PdfName('Border'), W=width, S=PdfName(style))

    if not dash_array:
        return border

    if style != 'D':
        raise ValueError('Dash array only applies to dashed borders!')
    border.D = dash_array
    return border
def _make_ap_resources(self):
    """Build the Resources entry of the appearance stream dictionary.

    Starts from a dictionary holding only the ProcSet entry, then lets the
    graphics state, XObject and font helpers add what the appearance
    needs. Subclasses can contribute further entries by implementing
    ``add_additional_resources``, which is called last.
    """
    appearance = self._appearance
    resources = PdfDict(ProcSet=PdfName('PDF'))

    self._add_graphics_state_resources(resources, appearance)
    self._add_xobject_resources(resources, appearance)
    self._add_font_resources(resources, appearance)

    # Hook for annotation-specific resources
    self.add_additional_resources(resources)
    return resources
def _make_appearance_stream_dict(self, bounding_box, transform):
    """Build the appearance (AP) dictionary for the annotation.

    :param bounding_box: annotation rectangle; used as the form's BBox and,
        through its first two entries, for the form's Matrix
    :param list transform: matrix applied to the content stream before it
        is resolved into its final form
    :returns PdfDict: dictionary whose N entry is the normal appearance
        Form XObject
    """
    resources = self._make_ap_resources()

    # A caller-supplied content stream wins; otherwise the annotation
    # type generates its own.
    content = self._appearance.appearance_stream
    if content is None:
        content = self.make_appearance_stream()

    # Move the stream into PDF space and resolve it
    resolved = content.transform(transform).resolve()

    form = PdfDict(
        stream=resolved,
        BBox=bounding_box,
        Resources=resources,
        Matrix=translate(-bounding_box[0], -bounding_box[1]),
        Type=PdfName('XObject'),
        Subtype=PdfName('Form'),
        FormType=1,
    )
    return PdfDict(N=form)
def pdfr2l(a_src, a_dest):
    """Write a copy of a PDF marked for right-to-left, two-page display.

    The document read from ``a_src`` is written to ``a_dest`` with the
    catalog's ViewerPreferences /Direction set to /R2L and its PageLayout
    set to /TwoColumnRight.

    Any viewer preferences already present in the source document are
    kept; only the Direction entry is added or overwritten.

    :param a_src: source PDF, passed to ``PdfReader``
    :param a_dest: destination, passed to ``PdfWriter.write``
    """
    src = PdfReader(a_src)
    dest = PdfWriter()
    # The reader itself serves as the trailer of the output document.
    dest.trailer = src

    root = dest.trailer.Root
    prefs = root.ViewerPreferences
    if not prefs:
        # No (or empty) preferences yet: start a fresh dictionary.
        prefs = root.ViewerPreferences = PdfDict()
    # Set the direction on the existing dictionary instead of replacing
    # it, so unrelated preferences survive.
    prefs.Direction = PdfName.R2L

    root.PageLayout = PdfName.TwoColumnRight
    dest.write(a_dest)
def _get_trailer(self):
    ''' Return the trailer, creating and caching it on first use.

        A new trailer consists of a catalog (Root) whose Pages node
        lists self.pagearray as its Kids, with Count set to the number
        of pages.  Every page gets its Parent pointed at that Pages
        node.
    '''
    cached = self._trailer
    if cached is not None:
        return cached

    # Assemble the basic object structure bottom-up
    kids = self.pagearray
    page_tree = IndirectPdfDict(
        Type=PdfName.Pages,
        Count=PdfObject(len(kids)),
        Kids=kids,
    )
    catalog = IndirectPdfDict(Type=PdfName.Catalog, Pages=page_tree)
    new_trailer = PdfDict(Root=catalog)

    # Make all the pages point back to the page dictionary
    pagedict = new_trailer.Root.Pages
    for kid in pagedict.Kids:
        kid.Parent = pagedict

    self._trailer = new_trailer
    return new_trailer
def _build_cache(contents, allow_compressed):
    ''' Build a new dictionary holding the stream,
        and save it along with private cache info.
        Assumes validity has been pre-checked if
        we have a non-None xobj_copy.

        contents is either a single dictionary or an array of them;
        the two cases are told apart by whether reading the
        xobj_copy attribute raises AttributeError.

        Returns the (possibly already cached) copy.  For an array of
        several streams the copy's stream is the newline-joined
        concatenation, and compression is then never allowed.
    '''
    try:
        xobj_copy = contents.xobj_copy
    except AttributeError:
        # Should have a PdfArray here...
        array = contents
        private = contents
    else:
        # Should have a PdfDict here -- might or might not have cache copy
        if xobj_copy is not None:
            return xobj_copy
        array = [contents]
        private = contents.private

    # The spec says nothing about nested arrays.  Will
    # assume that's not a problem until we encounter them...

    # Start from a copy of the first (or only) dictionary and give it
    # an empty per-view cache.
    xobj_copy = PdfDict(array[0])
    xobj_copy.private.xobj_cachedict = {}
    private.xobj_copy = xobj_copy

    if len(array) > 1:
        newstream = '\n'.join(x.stream for x in array)
        # One separator is added between each pair of streams
        newlength = sum(int(x.Length) for x in array) + len(array) - 1
        assert newlength == len(newstream)
        xobj_copy.stream = newstream

        # Cannot currently cope with different kinds of
        # compression in the array, so just disallow it.
        allow_compressed = False

    if not allow_compressed:
        # Make sure there are no compression parameters
        for cdict in array:
            assert len([x for x in cdict.iteritems()]) == 1

    # NOTE(review): the local name bound here is never read again in
    # this function; only the attribute assignment has an effect.
    cachedict = contents.cachedict = {}
    return xobj_copy
def get_jbig2_images(psem):
    # Produce the JBIG2 data for every image in
    # images_with_shared_globals.
    #
    # Besides psem (handed to every run_command_async call), this
    # relies on names from the enclosing scope: temp_dir,
    # images_with_shared_globals, symbol_mode and self.
    #
    # Returns a tuple (jbig2_images, jbig2_globals): the list of encoded
    # images and, in symbol mode only, an indirect PdfDict holding the
    # contents of "output.sym" (None otherwise).

    # Convert images with ImageMagick to bitonal png in parallel
    yield from asyncio.gather(*[
        run_command_async([
            CONVERT_CMD,
            "-alpha", "remove",
            "-alpha", "off",
            "-colorspace", "gray",
            "-threshold", "50%",
            path.abspath(image.filename),
            path.abspath(path.join(temp_dir, "input.%d.png" % i))
        ], psem)
        for i, image in enumerate(images_with_shared_globals)
    ])
    # Build one encoder command line covering all converted inputs
    cmd = [JBIG2_CMD, "-p"]
    if symbol_mode:
        cmd.extend(
            ["-s", "-t", format_number(self.jbig2_threshold, 4)])
    for i, _ in enumerate(images_with_shared_globals):
        cmd.append(
            path.abspath(path.join(temp_dir, "input.%d.png" % i)))
    jbig2_images = []
    jbig2_globals = None
    if symbol_mode:
        # Results are read back from files in temp_dir:
        # "output.sym" plus one "output.NNNN" per input.
        yield from run_command_async(cmd, psem, cwd=temp_dir)
        jbig2_globals = PdfDict()
        jbig2_globals.indirect = True
        with open(path.join(temp_dir, "output.sym"), "rb") as f:
            jbig2_globals.stream = f.read().decode("latin-1")
        for i, _ in enumerate(images_with_shared_globals):
            with open(path.join(temp_dir,
                                "output.%04d" % i), "rb") as f:
                jbig2_images.append(f.read())
    else:
        # The single image is whatever run_command_async returns
        # (presumably the encoder's standard output -- confirm).
        jbig2_images.append(
            (yield from run_command_async(cmd, psem, cwd=temp_dir)))
    return jbig2_images, jbig2_globals
def _build_font():
    # Assemble the font dictionary for the "GlyphLessFont": a Type0 font
    # with a single CIDFontType2 descendant, the embedded font program
    # read from FONT_FILENAME and a ToUnicode CMap read from
    # UNICODE_CMAP_FILENAME.  Returns the (indirect) font dictionary.

    def deflated_stream(raw, record_length=True):
        # Indirect stream object holding ``raw`` Flate-compressed;
        # Length1 optionally records the uncompressed size.
        holder = PdfDict()
        holder.indirect = True
        holder.Filter = [PdfName.FlateDecode]
        holder.stream = zlib.compress(raw, 9).decode("latin-1")
        if record_length:
            holder.Length1 = len(raw)
        return holder

    with open(FONT_FILENAME, "rb") as font_file:
        font_program = font_file.read()
    embedded_font = deflated_stream(font_program)

    descriptor = PdfDict()
    descriptor.indirect = True
    descriptor.Ascent = 1000
    descriptor.CapHeight = 1000
    descriptor.Descent = -1
    descriptor.Flags = 5  # FixedPitch + Symbolic
    descriptor.FontBBox = PdfArray([0, 0, 1000, 500])
    descriptor.FontFile2 = embedded_font
    descriptor.FontName = PdfName.GlyphLessFont
    descriptor.ItalicAngle = 0
    descriptor.StemV = 80
    descriptor.Type = PdfName.FontDescriptor

    # Map everything to glyph 1
    cid_to_gid_map = deflated_stream(b"\0\1" * (1 << 16))

    system_info = PdfDict()
    system_info.Ordering = PdfString.from_unicode("Identity")
    system_info.Registry = PdfString.from_unicode("Adobe")
    system_info.Supplement = 0

    descendant = PdfDict()
    descendant.indirect = True
    descendant.CIDToGIDMap = cid_to_gid_map
    descendant.BaseFont = PdfName.GlyphLessFont
    descendant.CIDSystemInfo = system_info
    descendant.FontDescriptor = descriptor
    descendant.Subtype = PdfName.CIDFontType2
    descendant.Type = PdfName.Font
    descendant.DW = 500

    with open(UNICODE_CMAP_FILENAME, "rb") as cmap_file:
        cmap_data = cmap_file.read()
    # The CMap stream carries no Length1 entry
    to_unicode = deflated_stream(cmap_data, record_length=False)

    type0_font = PdfDict()
    type0_font.indirect = True
    type0_font.BaseFont = PdfName.GlyphLessFont
    type0_font.DescendantFonts = PdfArray([descendant])
    type0_font.Encoding = PdfName("Identity-H")
    type0_font.Subtype = PdfName.Type0
    type0_font.ToUnicode = to_unicode
    type0_font.Type = PdfName.Font
    return type0_font
def write_async(self, outfile, process_semaphore, progress_cb=None):
    # Build the whole PDF document and write it to ``outfile``.
    #
    # One page object is created per entry of self._pages, all pages are
    # filled in concurrently by make_page(), document-level metadata is
    # attached, and the result is written to a temporary file that is
    # then post-processed by QPDF_CMD into ``outfile``.
    #
    # process_semaphore is passed on to everything that runs external
    # commands.  progress_cb, when given, is called after each finished
    # page with the fraction of pages done so far.
    pdf_writer = PdfWriter(version="1.5")

    # Transparency group shared by all pages
    pdf_group = PdfDict()
    pdf_group.indirect = True
    pdf_group.CS = PdfName.DeviceRGB
    pdf_group.I = PdfBool(True)
    pdf_group.S = PdfName.Transparency

    # Font resources shared by all pages that carry text
    pdf_font_mapping = PdfDict()
    pdf_font_mapping.indirect = True
    pdf_font_mapping.F1 = self._build_font()

    for _ in self._pages:
        pdf_page = PdfDict()
        pdf_page.Type = PdfName.Page
        pdf_writer.addpage(pdf_page)
    # pdfrw makes a internal copy of the pages
    # use the copy so that references to pages in links are correct
    pdf_pages = list(pdf_writer.pagearray)

    # ICC based colour space built from the profile in SRGB_ICC_FILENAME
    srgb_colorspace = PdfDict()
    srgb_colorspace.indirect = True
    srgb_colorspace.N = 3  # Number of components (red, green, blue)
    with open(SRGB_ICC_FILENAME, "rb") as f:
        srgb_colorspace_stream = f.read()
    srgb_colorspace.Filter = [PdfName.FlateDecode]
    srgb_colorspace.stream = zlib.compress(
        srgb_colorspace_stream, 9).decode("latin-1")
    srgb_colorspace.Length1 = len(srgb_colorspace_stream)
    default_rgb_colorspace = PdfArray([PdfName.ICCBased, srgb_colorspace])
    default_rgb_colorspace.indirect = True

    # Handle all pages in parallel
    @asyncio.coroutine
    def make_page(page, pdf_page, psem):
        # Prepare everything in parallel
        @asyncio.coroutine
        def get_pdf_thumbnail(psem):
            if page.thumbnail is None:
                return None
            return (yield from page.thumbnail.pdf_thumbnail(psem))

        @asyncio.coroutine
        def get_pdf_background(psem):
            if page.background is None:
                return None
            return (yield from page.background.pdf_image(psem))

        @asyncio.coroutine
        def get_pdf_mask(foreground, psem):
            # A mask is only requested for foregrounds without a colour
            if foreground.color is not None:
                return None
            return (yield from foreground.pdf_mask(psem))

        pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
            yield from asyncio.gather(
                get_pdf_thumbnail(psem),
                get_pdf_background(psem),
                asyncio.gather(
                    *[fg.pdf_image(psem) for fg in page.foreground]),
                asyncio.gather(
                    *[get_pdf_mask(fg, psem) for fg in page.foreground])))
        pdf_page.MediaBox = PdfArray(
            [0, 0, PdfNumber(page.width), PdfNumber(page.height)])
        pdf_page.Group = pdf_group
        pdf_resources = PdfDict()
        pdf_colorspace = PdfDict()
        pdf_colorspace.DefaultRGB = default_rgb_colorspace
        pdf_resources.ColorSpace = pdf_colorspace
        pdf_xobject = PdfDict()
        if pdf_thumbnail is not None:
            pdf_page.Thumb = pdf_thumbnail
        # Running counter used to name the image XObjects Im0, Im1, ...
        im_index = 0
        # Save graphics state and scale unity rectangle to page size
        matrix = TransformationMatrix()
        matrix.scale(page.width, page.height)
        before_graphics = ("q\n" +
                           "%s cm\n" % matrix.to_pdf())
        after_graphics = "\nQ\n"
        contents = ""
        graphics = ""
        # Last fill colour emitted, to avoid repeating "rg" operators
        current_color = None
        if page.color != self._factory.WHITE:
            # Paint the unit rectangle in the page colour
            if current_color != page.color:
                current_color = page.color
                graphics += page.color.to_pdf() + " rg "
            graphics += ("0 0 1 1 re " +
                         "f\n")
        if pdf_background is not None:
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        for foreground, pdf_foreground, pdf_mask in zip(
                page.foreground, pdf_foregrounds, pdf_masks):
            if pdf_mask is not None:
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
                im_index += 1
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
            if (foreground.color is not None and
                    current_color != foreground.color):
                current_color = foreground.color
                graphics += foreground.color.to_pdf() + " rg "
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        if graphics:
            contents += (before_graphics +
                         graphics.rstrip(" \n") +
                         after_graphics)
        current_color = None
        # Text is drawn with font F1 at size 1 in text rendering mode 3
        before_text = ("BT\n" +
                       "/F1 1 Tf 3 Tr\n")
        after_text = "\nET\n"
        text = ""
        pdf_annots = []
        for t in page.text:
            if t.text:
                matrix = TransformationMatrix()
                # Glyph size is 0.5 x 1
                matrix.scale(2 / len(t.text), 1)
                matrix.translate(-0.5, -0.5)
                if t.direction == "ltr":
                    pass
                elif t.direction == "rtl":
                    matrix.translate(0, -1)
                elif t.direction == "ttb":
                    matrix.rotate(90)
                matrix.rotate(-t.rotation)
                matrix.translate(0.5, 0.5)
                matrix.scale(t.width, t.height)
                matrix.translate(t.x, t.y)
                text += "%s Tm %s Tj\n" % (
                    matrix.to_pdf(),
                    PdfString().from_bytes(
                        t.text.encode("utf-16-be"), bytes_encoding="hex"))
            if t.external_link is not None or t.internal_link is not None:
                # Link annotation covering the text rectangle
                pdf_annot = PdfDict()
                pdf_annots.append(pdf_annot)
                pdf_annot.Type = PdfName.Annot
                pdf_annot.Subtype = PdfName.Link
                pdf_annot.Border = [0, 0, 0]
                pdf_annot.Rect = [
                    PdfNumber(t.x),
                    PdfNumber(t.y),
                    PdfNumber(t.x + t.width),
                    PdfNumber(t.y + t.height)
                ]
                if t.external_link is not None:
                    pdf_a = PdfDict()
                    pdf_annot.A = pdf_a
                    pdf_a.Type = PdfName.Action
                    pdf_a.S = PdfName.URI
                    pdf_a.URI = t.external_link.decode("latin-1")
                if t.internal_link is not None:
                    # internal_link is (page index, (x, y))
                    pdf_target_page = pdf_pages[t.internal_link[0]]
                    target_x, target_y = t.internal_link[1]
                    pdf_annot.Dest = [
                        pdf_target_page,
                        PdfName.XYZ,
                        PdfNumber(target_x),
                        PdfNumber(target_y),
                        0
                    ]
        text = text.rstrip(" \n")
        if text:
            pdf_resources.Font = pdf_font_mapping
            contents += (before_text +
                         text +
                         after_text)
        contents = contents.rstrip(" \n")
        if contents:
            pdf_contents = PdfDict()
            pdf_contents.indirect = True
            pdf_page.Contents = pdf_contents
            if COMPRESS_PAGE_CONTENTS:
                pdf_contents.Filter = [PdfName.FlateDecode]
                pdf_contents.stream = zlib.compress(
                    contents.encode("latin-1"), 9).decode("latin-1")
            else:
                pdf_contents.stream = contents
        if pdf_annots:
            pdf_page.Annots = pdf_annots
        if pdf_xobject:
            pdf_resources.XObject = pdf_xobject
        if pdf_resources:
            pdf_page.Resources = pdf_resources
        # Report progress
        nonlocal finished_pages
        finished_pages += 1
        if progress_cb:
            progress_cb(finished_pages / len(self._pages))

    # Counter updated by make_page through ``nonlocal``
    finished_pages = 0
    yield from asyncio.gather(*[
        make_page(page, pdf_page, process_semaphore)
        for page, pdf_page in zip(self._pages, pdf_pages)
    ])

    trailer = pdf_writer.trailer

    # The same random value is used for both elements of the ID
    document_id = PdfString().from_bytes(os.urandom(16))
    trailer.ID = [document_id, document_id]

    mark_info = PdfDict()
    mark_info.Marked = PdfBool(True)
    trailer.Root.MarkInfo = mark_info

    struct_tree_root = PdfDict()
    struct_tree_root.Type = PdfName.StructTreeRoot
    trailer.Root.StructTreeRoot = struct_tree_root

    # XMP metadata stream declaring PDF/A part "2", conformance "A"
    metadata = PdfDict()
    metadata.indirect = True
    metadata.Type = PdfName.Metadata
    metadata.Subtype = PdfName.XML
    xmp = XMPMeta()
    xmp.set_property(XMP_NS_PDFA_ID, "part", "2")
    xmp.set_property(XMP_NS_PDFA_ID, "conformance", "A")
    metadata_stream = xmp.serialize_to_str().encode("utf-8")
    metadata.Filter = [PdfName.FlateDecode]
    metadata.stream = zlib.compress(metadata_stream, 9).decode("latin-1")
    metadata.Length1 = len(metadata_stream)
    trailer.Root.Metadata = metadata

    # Write to a temporary file first, then let QPDF_CMD produce the
    # final output file from it.
    with TemporaryDirectory(prefix="djpdf-") as temp_dir:
        pdf_writer.write(path.join(temp_dir, "temp.pdf"))
        cmd = [
            QPDF_CMD,
            "--stream-data=preserve",
            "--object-streams=preserve",
            "--normalize-content=n",
            "--newline-before-endstream"
        ]
        if LINEARIZE_PDF:
            cmd.extend(["--linearize"])
        cmd.extend([
            path.abspath(path.join(temp_dir, "temp.pdf")),
            path.abspath(outfile)
        ])
        yield from run_command_async(cmd, process_semaphore)
def _add_font_resources(resources, A):
    """Register the appearance's fonts in the Resources dict.

    When ``A.fonts`` is empty or unset, ``resources`` is left untouched;
    otherwise ``resources.Font`` is replaced by a new dictionary mapping
    each font name to its font object.
    """
    if not A.fonts:
        return

    resources.Font = PdfDict()
    for label, font_obj in A.fonts.items():
        resources.Font[PdfName(label)] = font_obj
def make_page(page, pdf_page, psem):
    # Fill in one PDF page object (pdf_page) from its description (page).
    #
    # Thumbnail, background, foreground images and masks are produced
    # concurrently; afterwards the content stream (graphics, then text),
    # link annotations and resources are assembled and attached.
    #
    # Besides its parameters this relies on names from the enclosing
    # scope: self, pdf_group, default_rgb_colorspace, pdf_font_mapping,
    # pdf_pages, progress_cb and the ``nonlocal`` counter finished_pages.

    # Prepare everything in parallel
    @asyncio.coroutine
    def get_pdf_thumbnail(psem):
        if page.thumbnail is None:
            return None
        return (yield from page.thumbnail.pdf_thumbnail(psem))

    @asyncio.coroutine
    def get_pdf_background(psem):
        if page.background is None:
            return None
        return (yield from page.background.pdf_image(psem))

    @asyncio.coroutine
    def get_pdf_mask(foreground, psem):
        # A mask is only requested for foregrounds without a colour
        if foreground.color is not None:
            return None
        return (yield from foreground.pdf_mask(psem))

    pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
        yield from asyncio.gather(
            get_pdf_thumbnail(psem),
            get_pdf_background(psem),
            asyncio.gather(
                *[fg.pdf_image(psem) for fg in page.foreground]),
            asyncio.gather(
                *[get_pdf_mask(fg, psem) for fg in page.foreground])))
    pdf_page.MediaBox = PdfArray(
        [0, 0, PdfNumber(page.width), PdfNumber(page.height)])
    pdf_page.Group = pdf_group
    pdf_resources = PdfDict()
    pdf_colorspace = PdfDict()
    pdf_colorspace.DefaultRGB = default_rgb_colorspace
    pdf_resources.ColorSpace = pdf_colorspace
    pdf_xobject = PdfDict()
    if pdf_thumbnail is not None:
        pdf_page.Thumb = pdf_thumbnail
    # Running counter used to name the image XObjects Im0, Im1, ...
    im_index = 0
    # Save graphics state and scale unity rectangle to page size
    matrix = TransformationMatrix()
    matrix.scale(page.width, page.height)
    before_graphics = ("q\n" +
                       "%s cm\n" % matrix.to_pdf())
    after_graphics = "\nQ\n"
    contents = ""
    graphics = ""
    # Last fill colour emitted, to avoid repeating "rg" operators
    current_color = None
    if page.color != self._factory.WHITE:
        # Paint the unit rectangle in the page colour
        if current_color != page.color:
            current_color = page.color
            graphics += page.color.to_pdf() + " rg "
        graphics += ("0 0 1 1 re " +
                     "f\n")
    if pdf_background is not None:
        pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
        graphics += "/Im%d Do\n" % im_index
        im_index += 1
    for foreground, pdf_foreground, pdf_mask in zip(
            page.foreground, pdf_foregrounds, pdf_masks):
        if pdf_mask is not None:
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
            im_index += 1
        pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
        if (foreground.color is not None and
                current_color != foreground.color):
            current_color = foreground.color
            graphics += foreground.color.to_pdf() + " rg "
        graphics += "/Im%d Do\n" % im_index
        im_index += 1
    if graphics:
        contents += (before_graphics +
                     graphics.rstrip(" \n") +
                     after_graphics)
    current_color = None
    # Text is drawn with font F1 at size 1 in text rendering mode 3
    before_text = ("BT\n" +
                   "/F1 1 Tf 3 Tr\n")
    after_text = "\nET\n"
    text = ""
    pdf_annots = []
    for t in page.text:
        if t.text:
            matrix = TransformationMatrix()
            # Glyph size is 0.5 x 1
            matrix.scale(2 / len(t.text), 1)
            matrix.translate(-0.5, -0.5)
            if t.direction == "ltr":
                pass
            elif t.direction == "rtl":
                matrix.translate(0, -1)
            elif t.direction == "ttb":
                matrix.rotate(90)
            matrix.rotate(-t.rotation)
            matrix.translate(0.5, 0.5)
            matrix.scale(t.width, t.height)
            matrix.translate(t.x, t.y)
            text += "%s Tm %s Tj\n" % (
                matrix.to_pdf(),
                PdfString().from_bytes(
                    t.text.encode("utf-16-be"), bytes_encoding="hex"))
        if t.external_link is not None or t.internal_link is not None:
            # Link annotation covering the text rectangle
            pdf_annot = PdfDict()
            pdf_annots.append(pdf_annot)
            pdf_annot.Type = PdfName.Annot
            pdf_annot.Subtype = PdfName.Link
            pdf_annot.Border = [0, 0, 0]
            pdf_annot.Rect = [
                PdfNumber(t.x),
                PdfNumber(t.y),
                PdfNumber(t.x + t.width),
                PdfNumber(t.y + t.height)
            ]
            if t.external_link is not None:
                pdf_a = PdfDict()
                pdf_annot.A = pdf_a
                pdf_a.Type = PdfName.Action
                pdf_a.S = PdfName.URI
                pdf_a.URI = t.external_link.decode("latin-1")
            if t.internal_link is not None:
                # internal_link is (page index, (x, y))
                pdf_target_page = pdf_pages[t.internal_link[0]]
                target_x, target_y = t.internal_link[1]
                pdf_annot.Dest = [
                    pdf_target_page,
                    PdfName.XYZ,
                    PdfNumber(target_x),
                    PdfNumber(target_y),
                    0
                ]
    text = text.rstrip(" \n")
    if text:
        pdf_resources.Font = pdf_font_mapping
        contents += (before_text +
                     text +
                     after_text)
    contents = contents.rstrip(" \n")
    if contents:
        pdf_contents = PdfDict()
        pdf_contents.indirect = True
        pdf_page.Contents = pdf_contents
        if COMPRESS_PAGE_CONTENTS:
            pdf_contents.Filter = [PdfName.FlateDecode]
            pdf_contents.stream = zlib.compress(
                contents.encode("latin-1"), 9).decode("latin-1")
        else:
            pdf_contents.stream = contents
    if pdf_annots:
        pdf_page.Annots = pdf_annots
    if pdf_xobject:
        pdf_resources.XObject = pdf_xobject
    if pdf_resources:
        pdf_page.Resources = pdf_resources
    # Report progress
    nonlocal finished_pages
    finished_pages += 1
    if progress_cb:
        progress_cb(finished_pages / len(self._pages))
def __init__(self, fname=None, fdata=None, decompress=False,
             disable_gc=True):
    ''' Read and parse a PDF document.

        fname      -- file name, or an object with a read() method;
                      mutually exclusive with fdata
        fdata      -- the complete file contents
        decompress -- if true, self.uncompress() is called once the
                      page list has been read
        disable_gc -- if true (and the collector is currently on),
                      garbage collection is switched off while parsing
                      and switched back on afterwards

        Raises PdfParseError if the file cannot be read, is empty,
        has no %PDF- header or has no %EOF mark.
    '''
    # Runs a lot faster with GC off.
    disable_gc = disable_gc and gc.isenabled()
    try:
        if disable_gc:
            gc.disable()
        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                try:
                    # The context manager closes the file even when
                    # read() fails part-way.
                    with open(fname, 'rb') as f:
                        fdata = f.read()
                except IOError:
                    raise PdfParseError(
                        'Could not read PDF file %s' % fname)

        assert fdata is not None

        # Offset of the %PDF- header; normally the very first byte.
        header_loc = 0
        if not fdata.startswith('%PDF-'):
            header_loc = fdata.find('%PDF-')
            if header_loc >= 0:
                log.warning('PDF header not at beginning of file')
            else:
                lines = fdata.lstrip().splitlines()
                if not lines:
                    raise PdfParseError('Empty PDF file!')
                raise PdfParseError('Invalid PDF header: %s' %
                                    repr(lines[0]))

        # The three characters after "%PDF-" are the version; take
        # them relative to where the header actually is.
        self.version = fdata[header_loc + 5:header_loc + 8]

        endloc = fdata.rfind('%EOF')
        if endloc < 0:
            raise PdfParseError('EOF mark not found: %s' %
                                repr(fdata[-20:]))
        # Keep the mark itself plus up to two following characters
        endloc += 6
        junk = fdata[endloc:]
        fdata = fdata[:endloc]
        if junk.rstrip('\00').strip():
            log.warning('Extra data at end of file')

        private = self.private
        private.indirect_objects = {}
        private.deferred_objects = set()
        # Tokens that need dedicated parsing...
        private.special = {'<<': self.readdict,
                           '[': self.readarray,
                           'endobj': self.empty_obj,
                           }
        # ...and tokens that are never valid where an object is expected
        for tok in r'\ ( ) < > { } ] >> %'.split():
            self.special[tok] = self.badtoken

        startloc, source = self.findxref(fdata)
        private.source = source

        xref_list = []
        source.all_offsets = []
        while True:
            source.obj_offsets = {}
            # Loop through all the cross-reference tables/streams
            trailer = self.parsexref(source)

            # Loop if any previously-written xrefs.
            prev = trailer.Prev
            if prev is None:
                token = source.next()
                if token != 'startxref':
                    source.warning(
                        'Expected "startxref" at end of xref table')
                break
            if not xref_list:
                # The first trailer read is the most recent one;
                # remember it so it can override the older ones below.
                trailer.Prev = None
                original_trailer = trailer
            source.floc = int(prev)
            xref_list.append(source.obj_offsets)

        if xref_list:
            # Apply the tables oldest-first so newer entries win
            for update in reversed(xref_list):
                source.obj_offsets.update(update)
            trailer.update(original_trailer)

        if trailer.Version and \
                float(trailer.Version) > float(self.version):
            self.version = trailer.Version

        trailer = PdfDict(Root=trailer.Root,
                          Info=trailer.Info,
                          ID=trailer.ID
                          # TODO: add Encrypt when implemented
                          )
        self.update(trailer)

        private.pages = self.readpages(self.Root)
        if decompress:
            self.uncompress()
    finally:
        if disable_gc:
            gc.enable()
def _get_fullpage(contents, resources, mbox, bbox, rotation):
    ''' Return the dictionary for a Form XObject showing a whole page.

        The page contents are copied into a new dictionary and the
        page resources are attached to it; everything else is added
        by _cache_xobj.

        mbox, bbox and rotation are accepted for signature
        compatibility with _get_subpage but are not used here.
    '''
    page_copy = PdfDict(contents, Resources=resources)
    return page_copy
def _pdf_image(self, psem):
    # Encode this image (and, in shared symbol mode, every cached
    # Jbig2Image with the same compression and threshold) and build the
    # PDF image XObjects.
    #
    # Each handled image gets a future that is resolved with its finished
    # PdfDict; the value returned is the result of this image's own
    # future.  psem is passed on to everything that runs external
    # commands.
    with TemporaryDirectory(prefix="djpdf-") as temp_dir:
        # JBIG2Globals are only used in symbol mode
        # In symbol mode jbig2 writes output to files otherwise
        # it's written to stdout
        symbol_mode = self.jbig2_threshold != 1
        images_with_shared_globals = []
        if symbol_mode and SHARE_JBIG2_GLOBALS:
            # Find all Jbig2Images that share the same symbol directory
            for obj in self._factory._cache:
                if (isinstance(obj, Jbig2Image) and
                        self.compression == obj.compression and
                        self.jbig2_threshold == obj.jbig2_threshold):
                    images_with_shared_globals.append(obj)
        else:
            # The symbol directory is not shared with other Jbig2Images
            images_with_shared_globals.append(self)
        # Promise all handled Jbig2Images the finished image
        image_futures = []
        my_image_future = None
        for image in images_with_shared_globals:
            future = asyncio.Future()
            asyncio.ensure_future(image._cache.get(future))
            image_futures.append(future)
            if image is self:
                my_image_future = future
        # All futures are in place, the lock can be released
        self._factory._cache_lock.release()
        self._cache_lock_acquired = False

        # Prepare everything in parallel
        @asyncio.coroutine
        def get_jbig2_images(psem):
            # Convert images with ImageMagick to bitonal png in parallel
            yield from asyncio.gather(*[
                run_command_async([
                    CONVERT_CMD,
                    "-alpha", "remove",
                    "-alpha", "off",
                    "-colorspace", "gray",
                    "-threshold", "50%",
                    path.abspath(image.filename),
                    path.abspath(path.join(temp_dir, "input.%d.png" % i))
                ], psem)
                for i, image in enumerate(images_with_shared_globals)
            ])
            # Build one encoder command line covering all inputs
            cmd = [JBIG2_CMD, "-p"]
            if symbol_mode:
                cmd.extend(
                    ["-s", "-t", format_number(self.jbig2_threshold, 4)])
            for i, _ in enumerate(images_with_shared_globals):
                cmd.append(
                    path.abspath(path.join(temp_dir, "input.%d.png" % i)))
            jbig2_images = []
            jbig2_globals = None
            if symbol_mode:
                # Results are read back from files in temp_dir
                yield from run_command_async(cmd, psem, cwd=temp_dir)
                jbig2_globals = PdfDict()
                jbig2_globals.indirect = True
                with open(path.join(temp_dir, "output.sym"), "rb") as f:
                    jbig2_globals.stream = f.read().decode("latin-1")
                for i, _ in enumerate(images_with_shared_globals):
                    with open(path.join(temp_dir,
                                        "output.%04d" % i), "rb") as f:
                        jbig2_images.append(f.read())
            else:
                jbig2_images.append(
                    (yield from run_command_async(cmd, psem, cwd=temp_dir)))
            return jbig2_images, jbig2_globals

        @asyncio.coroutine
        def get_image_mask(image, psem):
            if image._mask is None:
                return None
            return (yield from image._mask.pdf_image(psem))

        ((jbig2_images, jbig2_globals), image_masks) = yield from asyncio.gather(
            get_jbig2_images(psem),
            asyncio.gather(*[
                get_image_mask(image, psem)
                for image in images_with_shared_globals
            ]))

        for image, jbig2_image, image_mask, image_future in zip(
                images_with_shared_globals, jbig2_images,
                image_masks, image_futures):
            # Four big-endian 32-bit integers at bytes 11..26 of the
            # encoded data; only width and height are used below.
            (width, height, xres, yres) = struct.unpack(
                '>IIII', jbig2_image[11:27])
            pdf_image = PdfDict()
            pdf_image.indirect = True
            pdf_image.Type = PdfName.XObject
            pdf_image.Subtype = PdfName.Image
            pdf_image.Width = width
            pdf_image.Height = height
            if image._image_mask:
                pdf_image.ImageMask = PdfBool(True)
            else:
                # NOTE: DefaultGray color space is required for PDF/A
                pdf_image.ColorSpace = PdfName.DeviceGray
            if image_mask is not None:
                pdf_image.Mask = image_mask
            pdf_image.BitsPerComponent = 1
            pdf_image.Filter = [PdfName.JBIG2Decode]
            if symbol_mode:
                pdf_image.DecodeParms = [{
                    PdfName.JBIG2Globals: jbig2_globals
                }]
            pdf_image.stream = jbig2_image.decode("latin-1")
            # Hand the finished image to whoever waits on this future
            image_future.set_result(pdf_image)
    return my_image_future.result()