Example #1
0
def wrap_object(obj, width, margin):
    ''' Wrap an xobj in its own page object.
    '''
    fmt = 'q %s 0 0 %s %s %s cm /MyImage Do Q'
    contents = PdfDict(indirect=True)
    subtype = obj.Subtype
    if subtype == PdfName.Form:
        contents._stream = obj.stream
        contents.Length = obj.Length
        contents.Filter = obj.Filter
        contents.DecodeParms = obj.DecodeParms
        resources = obj.Resources
        mbox = obj.BBox
    elif subtype == PdfName.Image:  # Image
        xoffset = margin[0]
        yoffset = margin[1]
        cw = width - margin[0] - margin[2]
        iw, ih = float(obj.Width), float(obj.Height)
        ch = 1.0 * cw / iw * ih
        height = ch + margin[1] + margin[3]
        p = tuple(('%.9f' % x).rstrip('0').rstrip('.') for x in (cw, ch, xoffset, yoffset))
        contents.stream = fmt % p
        resources = PdfDict(XObject=PdfDict(MyImage=obj))
        mbox = PdfArray((0, 0, width, height))
    else:
        raise TypeError("Expected Form or Image XObject")

    return PdfDict(
        indirect=True,
        Type=PdfName.Page,
        MediaBox=mbox,
        Resources=resources,
        Contents=contents,
        )
Example #2
0
def _get_subpage(contents, resources, mbox, bbox, rotation):
    ''' subpages *could* be as easy as full pages, but we
        choose to complicate life by creating a Form XObject
        for the page, and then one that references it for
        the subpage, on the off-chance that we want multiple
        items from the page.
    '''
    return PdfDict(
        stream='/FullPage Do\n',
        Resources=PdfDict(XObject=PdfDict(
            FullPage=_cache_xobj(contents, resources, mbox, mbox, 0))))
Example #3
0
    def readdict(self, source, PdfDict=PdfDict):
        ''' Found a << token.  Parse the tokens after that.
        '''
        specialget = self.special.get
        result = PdfDict()
        next = source.next

        tok = next()
        while tok != '>>':
            if not tok.startswith('/'):
                # Just skip the incorrect /name object.
                source.warning('Expected PDF /name object')
                tok = next()
                continue
            key = tok
            value = next()
            func = specialget(value)
            if func is not None:
                # Just keep working when bad token occurs.
                if func == self.badtoken:
                    tok = value
                    continue
                value = func(source)
                tok = next()
            else:
                tok = next()
                if value.isdigit() and tok.isdigit():
                    if next() != 'R':
                        source.exception('Expected "R" following two integers')
                    value = self.findindirect(value, tok)
                    tok = next()
            result[key] = value
        return result
Example #4
0
    def as_pdf_object(self, transform, page):
        """Return the PdfDict object representing the annotation, that will be
        inserted as is into the PDF document.

        :param list transform: Transformation matrix to transform the coords
            of the annotation from client-specified space to PDF user space.
        :param PdfDict page: The pdfrw page object from the PDF document
        :returns PdfDict: the annotation object to be inserted into the PDF
        """
        bounding_box = transform_rect(self.make_rect(), transform)
        appearance_stream = self._make_appearance_stream_dict(
            bounding_box,
            transform,
        )

        obj = PdfDict(
            Type=PdfName('Annot'),
            Subtype=PdfName(self.subtype),
            Rect=bounding_box,
            AP=appearance_stream,
        )
        if self._page_as_parent:
            obj.P = page

        for name in self._related:
            subobject = self._related[name].as_pdf_object(transform, page)
            setattr(obj, name, subobject)

        self._add_metadata(obj, self._metadata)
        self.add_additional_pdf_object_data(obj)
        obj.indirect = True

        return obj
Example #5
0
def _cache_xobj(contents, resources, mbox, bbox, rotation):
    ''' Return a cached Form XObject, or create a new one and cache it.
        Adds private members x, y, w, h
    '''
    cachedict = contents.xobj_cachedict
    if cachedict is None:
        cachedict = contents.private.xobj_cachedict = {}
    cachekey = mbox, bbox, rotation
    result = cachedict.get(cachekey)
    if result is None:
        func = (_get_fullpage, _get_subpage)[mbox != bbox]
        result = PdfDict(
            func(contents, resources, mbox, bbox, rotation),
            Type=PdfName.XObject,
            Subtype=PdfName.Form,
            FormType=1,
            BBox=PdfArray(bbox),
        )
        rect = bbox
        if rotation:
            matrix = rotate_point((1, 0), rotation) + \
                rotate_point((0, 1), rotation)
            result.Matrix = PdfArray(matrix + (0, 0))
            rect = rotate_rect(rect, rotation)

        result.private.x = rect[0]
        result.private.y = rect[1]
        result.private.w = rect[2] - rect[0]
        result.private.h = rect[3] - rect[1]
        cachedict[cachekey] = result
    return result
Example #6
0
    def readdict(self, source, PdfDict=PdfDict):
        ''' Found a << token.  Parse the tokens after that.
        '''
        specialget = self.special.get
        result = PdfDict()
        next = source.next

        tok = next()
        while tok != '>>':
            if not tok.startswith('/'):
                source.exception('Expected PDF /name object')
            key = tok
            value = next()
            func = specialget(value)
            if func is not None:
                value = func(source)
                tok = next()
            else:
                tok = next()
                if value.isdigit() and tok.isdigit():
                    if next() != 'R':
                        source.exception('Expected "R" following two integers')
                    value = self.findindirect(value, tok)
                    tok = next()
            result[key] = value
        return result
Example #7
0
    def _add_graphics_state_resources(resources, A):
        """Add in the resources dict for turning on transparency in the
        graphics state. For example, if both stroke and fill were transparent,
        this would add:
            << /ExtGState /PdfAnnotatorGS <<
                /CA 0.5 /ca 0.75 /Type /ExtGState
            >> >>
        to the Resources dict.

        Graphics states can also be specified externally, for use in explicit
        content streams. This is done by using the `graphics_states` property
        on the appearance object.
        """
        states = []
        internal_state = Annotation._get_internal_graphics_state(resources, A)
        if internal_state is not None:
            states.append((GRAPHICS_STATE_NAME, internal_state))

        if A.graphics_states:
            for name, state in A.graphics_states.items():
                states.append((name, state.as_pdf_dict()))

        if states:
            resources.ExtGState = PdfDict()
            for name, state in states:
                resources.ExtGState[PdfName(name)] = state
Example #8
0
def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' pagexobj creates and returns a Form XObject for
        a given view within a page (Defaults to entire page.)
    '''
    inheritable = page.inheritable
    resources = inheritable.Resources
    rotation = get_rotation(inheritable.Rotate)
    mbox, bbox = getrects(inheritable, viewinfo, rotation)
    rotation += get_rotation(viewinfo.rotate)
    if isinstance(page.Contents, PdfArray):
        if len(page.Contents) == 1:
            contents = page.Contents[0]
        else:
            # decompress and join multiple streams
            contlist = [c for c in page.Contents]
            uncompress(contlist)
            stream = '\n'.join([c.stream for c in contlist])
            contents = PdfDict(stream=stream)
    else:
        contents = page.Contents
    # Make sure the only attribute is length
    # All the filters must have been executed
    assert int(contents.Length) == len(contents.stream)
    if not allow_compressed:
        assert len([x for x in contents.iteritems()]) == 1
    return _cache_xobj(contents, resources, mbox, bbox, rotation)
Example #9
0
def uncompress(mylist,
               leave_raw=False,
               warnings=set(),
               flate=PdfName.FlateDecode,
               decompress=decompressobj,
               isinstance=isinstance,
               list=list,
               len=len):
    ok = True
    for obj in streamobjects(mylist):
        ftype = obj.Filter
        if ftype is None:
            continue
        if isinstance(ftype, list) and len(ftype) == 1:
            # todo: multiple filters
            ftype = ftype[0]
        parms = obj.DecodeParms or obj.DP
        if ftype != flate:
            msg = ('Not decompressing: cannot use filter %s'
                   ' with parameters %s') % (repr(ftype), repr(parms))
            if msg not in warnings:
                warnings.add(msg)
                log.warning(msg)
            ok = False
        else:
            dco = decompress()
            try:
                data = dco.decompress(convert_store(obj.stream))
            except Exception as s:
                error = str(s)
            else:
                error = None
                if isinstance(parms, PdfArray):
                    oldparms = parms
                    parms = PdfDict()
                    for x in oldparms:
                        parms.update(x)
                if parms:
                    predictor = int(parms.Predictor or 1)
                    columns = int(parms.Columns or 1)
                    colors = int(parms.Colors or 1)
                    bpc = int(parms.BitsPerComponent or 8)
                    if 10 <= predictor <= 15:
                        data, error = flate_png(data, predictor, columns,
                                                colors, bpc)
                    elif predictor != 1:
                        error = ('Unsupported flatedecode predictor %s' %
                                 repr(predictor))
            if error is None:
                assert not dco.unconsumed_tail
                if dco.unused_data.strip():
                    error = ('Unconsumed compression data: %s' %
                             repr(dco.unused_data[:20]))
            if error is None:
                obj.Filter = None
                obj.stream = data if leave_raw else convert_load(data)
            else:
                log.error('%s %s' % (error, repr(obj.indirect)))
                ok = False
    return ok
Example #10
0
 def _add_xobject_resources(resources, A):
     """Adds in provided, explicit XObjects into the appearance stream's
     Resources dict. This is used when the user is explicitly specifying the
     appearance stream and they want to include, say, an image.
     """
     if A.xobjects:
         resources.XObject = PdfDict()
         for xobject_name, xobject in A.xobjects.items():
             resources.XObject[PdfName(xobject_name)] = xobject
Example #11
0
def _make_border_dict(width, style, dash_array=None):
    border = PdfDict(
        Type=PdfName('Border'),
        W=width,
        S=PdfName(style),
    )
    if dash_array:
        if style != 'D':
            raise ValueError('Dash array only applies to dashed borders!')
        border.D = dash_array
    return border
Example #12
0
    def _make_ap_resources(self):
        """Make the Resources entry for the appearance stream dictionary.

        Implement add_additional_resources to add additional entries -
        fonts, XObjects, graphics state - to the Resources dictionary.
        """
        resources = PdfDict(ProcSet=PdfName('PDF'))
        self._add_graphics_state_resources(resources, self._appearance)
        self._add_xobject_resources(resources, self._appearance)
        self._add_font_resources(resources, self._appearance)
        self.add_additional_resources(resources)
        return resources
Example #13
0
    def _make_appearance_stream_dict(self, bounding_box, transform):
        resources = self._make_ap_resources()

        # Either use user-specified content stream or generate content stream
        # based on annotation type.
        stream = self._appearance.appearance_stream
        if stream is None:
            stream = self.make_appearance_stream()

        # Transform the appearance stream into PDF space and turn it into a str
        appearance_stream = stream.transform(transform).resolve()

        normal_appearance = PdfDict(
            stream=appearance_stream,
            BBox=bounding_box,
            Resources=resources,
            Matrix=translate(-bounding_box[0], -bounding_box[1]),
            Type=PdfName('XObject'),
            Subtype=PdfName('Form'),
            FormType=1,
        )
        return PdfDict(N=normal_appearance)
Example #14
0
def pdfr2l(a_src, a_dest):
    """
    pdfr2l
    """
    # --------------------------------------------------------------------------
    src = PdfReader(a_src)
    dest = PdfWriter()

    # dest.addpages(src.pages)

    dest.trailer = src

    # print dest.trailer.Root

    if dest.trailer.Root.ViewerPreferences:
        dest.trailer.Root.ViewerPreferences = PdfDict(Direction=PdfName.R2L)
    else:
        dest.trailer.Root.ViewerPreferences = PdfDict()
        dest.trailer.Root.ViewerPreferences.Direction = PdfName.R2L

    dest.trailer.Root.PageLayout = PdfName.TwoColumnRight

    dest.write(a_dest)
    def _get_trailer(self):
        trailer = self._trailer
        if trailer is not None:
            return trailer

        # Create the basic object structure of the PDF file
        trailer = PdfDict(Root=IndirectPdfDict(
            Type=PdfName.Catalog,
            Pages=IndirectPdfDict(Type=PdfName.Pages,
                                  Count=PdfObject(len(self.pagearray)),
                                  Kids=self.pagearray)))
        # Make all the pages point back to the page dictionary and
        # ensure they are indirect references
        pagedict = trailer.Root.Pages
        for page in pagedict.Kids:
            page.Parent = pagedict
        self._trailer = trailer
        return trailer
Example #16
0
def _build_cache(contents, allow_compressed):
    ''' Build a new dictionary holding the stream,
        and save it along with private cache info.
        Assumes validity has been pre-checked if
        we have a non-None xobj_copy.
    '''
    try:
        xobj_copy = contents.xobj_copy
    except AttributeError:
        # Should have a PdfArray here...
        array = contents
        private = contents
    else:
        # Should have a PdfDict here -- might or might not have cache copy
        if xobj_copy is not None:
            return xobj_copy
        array = [contents]
        private = contents.private

    # The spec says nothing about nested arrays.  Will
    # assume that's not a problem until we encounter them...

    xobj_copy = PdfDict(array[0])
    xobj_copy.private.xobj_cachedict = {}
    private.xobj_copy = xobj_copy

    if len(array) > 1:
        newstream = '\n'.join(x.stream for x in array)
        newlength = sum(int(x.Length) for x in array) + len(array) - 1
        assert newlength == len(newstream)
        xobj_copy.stream = newstream

        # Cannot currently cope with different kinds of
        # compression in the array, so just disallow it.
        allow_compressed = False

    if not allow_compressed:
        # Make sure there are no compression parameters
        for cdict in array:
            assert len([x for x in cdict.iteritems()]) == 1

            cachedict = contents.cachedict = {}
    return xobj_copy
Example #17
0
 def get_jbig2_images(psem):
     # Convert images with ImageMagick to bitonal png in parallel
     yield from asyncio.gather(*[
         run_command_async([
             CONVERT_CMD, "-alpha", "remove", "-alpha", "off",
             "-colorspace", "gray", "-threshold", "50%",
             path.abspath(image.filename),
             path.abspath(path.join(temp_dir, "input.%d.png" % i))
         ], psem)
         for i, image in enumerate(images_with_shared_globals)
     ])
     cmd = [JBIG2_CMD, "-p"]
     if symbol_mode:
         cmd.extend(
             ["-s", "-t",
              format_number(self.jbig2_threshold, 4)])
     for i, _ in enumerate(images_with_shared_globals):
         cmd.append(
             path.abspath(path.join(temp_dir, "input.%d.png" % i)))
     jbig2_images = []
     jbig2_globals = None
     if symbol_mode:
         yield from run_command_async(cmd, psem, cwd=temp_dir)
         jbig2_globals = PdfDict()
         jbig2_globals.indirect = True
         with open(path.join(temp_dir, "output.sym"), "rb") as f:
             jbig2_globals.stream = f.read().decode("latin-1")
         for i, _ in enumerate(images_with_shared_globals):
             with open(path.join(temp_dir, "output.%04d" % i),
                       "rb") as f:
                 jbig2_images.append(f.read())
     else:
         jbig2_images.append((yield from
                              run_command_async(cmd,
                                                psem,
                                                cwd=temp_dir)))
     return jbig2_images, jbig2_globals
Example #18
0
    def _build_font():
        with open(FONT_FILENAME, "rb") as f:
            embedded_font_stream = f.read()
        embedded_font = PdfDict()
        embedded_font.indirect = True
        embedded_font.Filter = [PdfName.FlateDecode]
        embedded_font.stream = zlib.compress(embedded_font_stream,
                                             9).decode("latin-1")
        embedded_font.Length1 = len(embedded_font_stream)

        font_descriptor = PdfDict()
        font_descriptor.indirect = True
        font_descriptor.Ascent = 1000
        font_descriptor.CapHeight = 1000
        font_descriptor.Descent = -1
        font_descriptor.Flags = 5  # FixedPitch + Symbolic
        font_descriptor.FontBBox = PdfArray([0, 0, 1000, 500])
        font_descriptor.FontFile2 = embedded_font
        font_descriptor.FontName = PdfName.GlyphLessFont
        font_descriptor.ItalicAngle = 0
        font_descriptor.StemV = 80
        font_descriptor.Type = PdfName.FontDescriptor

        # Map everything to glyph 1
        cid_to_gid_map_stream = b"\0\1" * (1 << 16)
        cid_to_gid_map = PdfDict()
        cid_to_gid_map.indirect = True
        cid_to_gid_map.Filter = [PdfName.FlateDecode]
        cid_to_gid_map.stream = zlib.compress(cid_to_gid_map_stream,
                                              9).decode("latin-1")
        cid_to_gid_map.Length1 = len(cid_to_gid_map_stream)

        cid_system_info = PdfDict()
        cid_system_info.Ordering = PdfString.from_unicode("Identity")
        cid_system_info.Registry = PdfString.from_unicode("Adobe")
        cid_system_info.Supplement = 0

        cid_font = PdfDict()
        cid_font.indirect = True
        cid_font.CIDToGIDMap = cid_to_gid_map
        cid_font.BaseFont = PdfName.GlyphLessFont
        cid_font.CIDSystemInfo = cid_system_info
        cid_font.FontDescriptor = font_descriptor
        cid_font.Subtype = PdfName.CIDFontType2
        cid_font.Type = PdfName.Font
        cid_font.DW = 500

        with open(UNICODE_CMAP_FILENAME, "rb") as f:
            unicode_cmap_stream = f.read()
        unicode_cmap = PdfDict()
        unicode_cmap.indirect = True
        unicode_cmap.Filter = [PdfName.FlateDecode]
        unicode_cmap.stream = zlib.compress(unicode_cmap_stream,
                                            9).decode("latin-1")

        font = PdfDict()
        font.indirect = True
        font.BaseFont = PdfName.GlyphLessFont
        font.DescendantFonts = PdfArray([cid_font])
        font.Encoding = PdfName("Identity-H")
        font.Subtype = PdfName.Type0
        font.ToUnicode = unicode_cmap
        font.Type = PdfName.Font

        return font
Example #19
0
    def write_async(self, outfile, process_semaphore, progress_cb=None):
        pdf_writer = PdfWriter(version="1.5")

        pdf_group = PdfDict()
        pdf_group.indirect = True
        pdf_group.CS = PdfName.DeviceRGB
        pdf_group.I = PdfBool(True)
        pdf_group.S = PdfName.Transparency

        pdf_font_mapping = PdfDict()
        pdf_font_mapping.indirect = True
        pdf_font_mapping.F1 = self._build_font()

        for _ in self._pages:
            pdf_page = PdfDict()
            pdf_page.Type = PdfName.Page
            pdf_writer.addpage(pdf_page)
        # pdfrw makes a internal copy of the pages
        # use the copy so that references to pages in links are correct
        pdf_pages = list(pdf_writer.pagearray)

        srgb_colorspace = PdfDict()
        srgb_colorspace.indirect = True
        srgb_colorspace.N = 3  # Number of components (red, green, blue)
        with open(SRGB_ICC_FILENAME, "rb") as f:
            srgb_colorspace_stream = f.read()
        srgb_colorspace.Filter = [PdfName.FlateDecode]
        srgb_colorspace.stream = zlib.compress(srgb_colorspace_stream,
                                               9).decode("latin-1")
        srgb_colorspace.Length1 = len(srgb_colorspace_stream)
        default_rgb_colorspace = PdfArray([PdfName.ICCBased, srgb_colorspace])
        default_rgb_colorspace.indirect = True

        # Handle all pages in parallel
        @asyncio.coroutine
        def make_page(page, pdf_page, psem):
            # Prepare everything in parallel
            @asyncio.coroutine
            def get_pdf_thumbnail(psem):
                if page.thumbnail is None:
                    return None
                return (yield from page.thumbnail.pdf_thumbnail(psem))

            @asyncio.coroutine
            def get_pdf_background(psem):
                if page.background is None:
                    return None
                return (yield from page.background.pdf_image(psem))

            @asyncio.coroutine
            def get_pdf_mask(foreground, psem):
                if foreground.color is not None:
                    return None
                return (yield from foreground.pdf_mask(psem))

            pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
                yield from asyncio.gather(
                    get_pdf_thumbnail(psem), get_pdf_background(psem),
                    asyncio.gather(
                        *[fg.pdf_image(psem) for fg in page.foreground]),
                    asyncio.gather(
                        *[get_pdf_mask(fg, psem) for fg in page.foreground])))
            pdf_page.MediaBox = PdfArray(
                [0, 0, PdfNumber(page.width),
                 PdfNumber(page.height)])
            pdf_page.Group = pdf_group
            pdf_resources = PdfDict()
            pdf_colorspace = PdfDict()
            pdf_colorspace.DefaultRGB = default_rgb_colorspace
            pdf_resources.ColorSpace = pdf_colorspace
            pdf_xobject = PdfDict()
            if pdf_thumbnail is not None:
                pdf_page.Thumb = pdf_thumbnail
            im_index = 0
            # Save graphics state and scale unity rectangle to page size
            matrix = TransformationMatrix()
            matrix.scale(page.width, page.height)
            before_graphics = ("q\n" + "%s cm\n" % matrix.to_pdf())
            after_graphics = "\nQ\n"
            contents = ""
            graphics = ""
            current_color = None
            if page.color != self._factory.WHITE:
                if current_color != page.color:
                    current_color = page.color
                    graphics += page.color.to_pdf() + " rg "
                graphics += ("0 0 1 1 re " + "f\n")

            if pdf_background is not None:
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
                graphics += "/Im%d Do\n" % im_index
                im_index += 1
            for foreground, pdf_foreground, pdf_mask in zip(
                    page.foreground, pdf_foregrounds, pdf_masks):
                if pdf_mask is not None:
                    pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
                    im_index += 1
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
                if (foreground.color is not None
                        and current_color != foreground.color):
                    current_color = foreground.color
                    graphics += foreground.color.to_pdf() + " rg "
                graphics += "/Im%d Do\n" % im_index
                im_index += 1
            if graphics:
                contents += (before_graphics + graphics.rstrip(" \n") +
                             after_graphics)
            current_color = None
            before_text = ("BT\n" + "/F1 1 Tf 3 Tr\n")
            after_text = "\nET\n"
            text = ""
            pdf_annots = []
            for t in page.text:
                if t.text:
                    matrix = TransformationMatrix()
                    # Glyph size is 0.5 x 1
                    matrix.scale(2 / len(t.text), 1)
                    matrix.translate(-0.5, -0.5)
                    if t.direction == "ltr":
                        pass
                    elif t.direction == "rtl":
                        matrix.translate(0, -1)
                    elif t.direction == "ttb":
                        matrix.rotate(90)
                    matrix.rotate(-t.rotation)
                    matrix.translate(0.5, 0.5)
                    matrix.scale(t.width, t.height)
                    matrix.translate(t.x, t.y)
                    text += "%s Tm %s Tj\n" % (
                        matrix.to_pdf(), PdfString().from_bytes(
                            t.text.encode("utf-16-be"), bytes_encoding="hex"))
                if t.external_link is not None or t.internal_link is not None:
                    pdf_annot = PdfDict()
                    pdf_annots.append(pdf_annot)
                    pdf_annot.Type = PdfName.Annot
                    pdf_annot.Subtype = PdfName.Link
                    pdf_annot.Border = [0, 0, 0]
                    pdf_annot.Rect = [
                        PdfNumber(t.x),
                        PdfNumber(t.y),
                        PdfNumber(t.x + t.width),
                        PdfNumber(t.y + t.height)
                    ]
                    if t.external_link is not None:
                        pdf_a = PdfDict()
                        pdf_annot.A = pdf_a
                        pdf_a.Type = PdfName.Action
                        pdf_a.S = PdfName.URI
                        pdf_a.URI = t.external_link.decode("latin-1")
                    if t.internal_link is not None:
                        pdf_target_page = pdf_pages[t.internal_link[0]]
                        target_x, target_y = t.internal_link[1]
                        pdf_annot.Dest = [
                            pdf_target_page, PdfName.XYZ,
                            PdfNumber(target_x),
                            PdfNumber(target_y), 0
                        ]
            text = text.rstrip(" \n")
            if text:
                pdf_resources.Font = pdf_font_mapping
                contents += (before_text + text + after_text)
            contents = contents.rstrip(" \n")
            if contents:
                pdf_contents = PdfDict()
                pdf_contents.indirect = True
                pdf_page.Contents = pdf_contents
                if COMPRESS_PAGE_CONTENTS:
                    pdf_contents.Filter = [PdfName.FlateDecode]
                    pdf_contents.stream = zlib.compress(
                        contents.encode("latin-1"), 9).decode("latin-1")
                else:
                    pdf_contents.stream = contents
            if pdf_annots:
                pdf_page.Annots = pdf_annots
            if pdf_xobject:
                pdf_resources.XObject = pdf_xobject
            if pdf_resources:
                pdf_page.Resources = pdf_resources
            # Report progress
            nonlocal finished_pages
            finished_pages += 1
            if progress_cb:
                progress_cb(finished_pages / len(self._pages))

        finished_pages = 0
        yield from asyncio.gather(*[
            make_page(page, pdf_page, process_semaphore)
            for page, pdf_page in zip(self._pages, pdf_pages)
        ])

        trailer = pdf_writer.trailer

        document_id = PdfString().from_bytes(os.urandom(16))
        trailer.ID = [document_id, document_id]

        mark_info = PdfDict()
        mark_info.Marked = PdfBool(True)
        trailer.Root.MarkInfo = mark_info

        struct_tree_root = PdfDict()
        struct_tree_root.Type = PdfName.StructTreeRoot
        trailer.Root.StructTreeRoot = struct_tree_root

        metadata = PdfDict()
        metadata.indirect = True
        metadata.Type = PdfName.Metadata
        metadata.Subtype = PdfName.XML
        xmp = XMPMeta()
        xmp.set_property(XMP_NS_PDFA_ID, "part", "2")
        xmp.set_property(XMP_NS_PDFA_ID, "conformance", "A")
        metadata_stream = xmp.serialize_to_str().encode("utf-8")
        metadata.Filter = [PdfName.FlateDecode]
        metadata.stream = zlib.compress(metadata_stream, 9).decode("latin-1")
        metadata.Length1 = len(metadata_stream)
        trailer.Root.Metadata = metadata

        with TemporaryDirectory(prefix="djpdf-") as temp_dir:
            pdf_writer.write(path.join(temp_dir, "temp.pdf"))
            cmd = [
                QPDF_CMD, "--stream-data=preserve",
                "--object-streams=preserve", "--normalize-content=n",
                "--newline-before-endstream"
            ]
            if LINEARIZE_PDF:
                cmd.extend(["--linearize"])
            cmd.extend([
                path.abspath(path.join(temp_dir, "temp.pdf")),
                path.abspath(outfile)
            ])
            yield from run_command_async(cmd, process_semaphore)
Example #20
0
 def _add_font_resources(resources, A):
     if A.fonts:
         resources.Font = PdfDict()
         for font_name, font in A.fonts.items():
             resources.Font[PdfName(font_name)] = font
Example #21
0
        def make_page(page, pdf_page, psem):
            # Prepare everything in parallel
            @asyncio.coroutine
            def get_pdf_thumbnail(psem):
                if page.thumbnail is None:
                    return None
                return (yield from page.thumbnail.pdf_thumbnail(psem))

            @asyncio.coroutine
            def get_pdf_background(psem):
                if page.background is None:
                    return None
                return (yield from page.background.pdf_image(psem))

            @asyncio.coroutine
            def get_pdf_mask(foreground, psem):
                if foreground.color is not None:
                    return None
                return (yield from foreground.pdf_mask(psem))

            pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
                yield from asyncio.gather(
                    get_pdf_thumbnail(psem), get_pdf_background(psem),
                    asyncio.gather(
                        *[fg.pdf_image(psem) for fg in page.foreground]),
                    asyncio.gather(
                        *[get_pdf_mask(fg, psem) for fg in page.foreground])))
            pdf_page.MediaBox = PdfArray(
                [0, 0, PdfNumber(page.width),
                 PdfNumber(page.height)])
            pdf_page.Group = pdf_group
            pdf_resources = PdfDict()
            pdf_colorspace = PdfDict()
            pdf_colorspace.DefaultRGB = default_rgb_colorspace
            pdf_resources.ColorSpace = pdf_colorspace
            pdf_xobject = PdfDict()
            if pdf_thumbnail is not None:
                pdf_page.Thumb = pdf_thumbnail
            im_index = 0
            # Save graphics state and scale unity rectangle to page size
            matrix = TransformationMatrix()
            matrix.scale(page.width, page.height)
            before_graphics = ("q\n" + "%s cm\n" % matrix.to_pdf())
            after_graphics = "\nQ\n"
            contents = ""
            graphics = ""
            current_color = None
            if page.color != self._factory.WHITE:
                if current_color != page.color:
                    current_color = page.color
                    graphics += page.color.to_pdf() + " rg "
                graphics += ("0 0 1 1 re " + "f\n")

            if pdf_background is not None:
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
                graphics += "/Im%d Do\n" % im_index
                im_index += 1
            for foreground, pdf_foreground, pdf_mask in zip(
                    page.foreground, pdf_foregrounds, pdf_masks):
                if pdf_mask is not None:
                    pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
                    im_index += 1
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
                if (foreground.color is not None
                        and current_color != foreground.color):
                    current_color = foreground.color
                    graphics += foreground.color.to_pdf() + " rg "
                graphics += "/Im%d Do\n" % im_index
                im_index += 1
            if graphics:
                contents += (before_graphics + graphics.rstrip(" \n") +
                             after_graphics)
            current_color = None
            before_text = ("BT\n" + "/F1 1 Tf 3 Tr\n")
            after_text = "\nET\n"
            text = ""
            pdf_annots = []
            for t in page.text:
                if t.text:
                    matrix = TransformationMatrix()
                    # Glyph size is 0.5 x 1
                    matrix.scale(2 / len(t.text), 1)
                    matrix.translate(-0.5, -0.5)
                    if t.direction == "ltr":
                        pass
                    elif t.direction == "rtl":
                        matrix.translate(0, -1)
                    elif t.direction == "ttb":
                        matrix.rotate(90)
                    matrix.rotate(-t.rotation)
                    matrix.translate(0.5, 0.5)
                    matrix.scale(t.width, t.height)
                    matrix.translate(t.x, t.y)
                    text += "%s Tm %s Tj\n" % (
                        matrix.to_pdf(), PdfString().from_bytes(
                            t.text.encode("utf-16-be"), bytes_encoding="hex"))
                if t.external_link is not None or t.internal_link is not None:
                    pdf_annot = PdfDict()
                    pdf_annots.append(pdf_annot)
                    pdf_annot.Type = PdfName.Annot
                    pdf_annot.Subtype = PdfName.Link
                    pdf_annot.Border = [0, 0, 0]
                    pdf_annot.Rect = [
                        PdfNumber(t.x),
                        PdfNumber(t.y),
                        PdfNumber(t.x + t.width),
                        PdfNumber(t.y + t.height)
                    ]
                    if t.external_link is not None:
                        pdf_a = PdfDict()
                        pdf_annot.A = pdf_a
                        pdf_a.Type = PdfName.Action
                        pdf_a.S = PdfName.URI
                        pdf_a.URI = t.external_link.decode("latin-1")
                    if t.internal_link is not None:
                        pdf_target_page = pdf_pages[t.internal_link[0]]
                        target_x, target_y = t.internal_link[1]
                        pdf_annot.Dest = [
                            pdf_target_page, PdfName.XYZ,
                            PdfNumber(target_x),
                            PdfNumber(target_y), 0
                        ]
            text = text.rstrip(" \n")
            if text:
                pdf_resources.Font = pdf_font_mapping
                contents += (before_text + text + after_text)
            contents = contents.rstrip(" \n")
            if contents:
                pdf_contents = PdfDict()
                pdf_contents.indirect = True
                pdf_page.Contents = pdf_contents
                if COMPRESS_PAGE_CONTENTS:
                    pdf_contents.Filter = [PdfName.FlateDecode]
                    pdf_contents.stream = zlib.compress(
                        contents.encode("latin-1"), 9).decode("latin-1")
                else:
                    pdf_contents.stream = contents
            if pdf_annots:
                pdf_page.Annots = pdf_annots
            if pdf_xobject:
                pdf_resources.XObject = pdf_xobject
            if pdf_resources:
                pdf_page.Resources = pdf_resources
            # Report progress
            nonlocal finished_pages
            finished_pages += 1
            if progress_cb:
                progress_cb(finished_pages / len(self._pages))
Example #22
0
    def __init__(self,
                 fname=None,
                 fdata=None,
                 decompress=False,
                 disable_gc=True):

        # Runs a lot faster with GC off.
        disable_gc = disable_gc and gc.isenabled()
        try:
            if disable_gc:
                gc.disable()
            if fname is not None:
                assert fdata is None
                # Allow reading preexisting streams like pyPdf
                if hasattr(fname, 'read'):
                    fdata = fname.read()
                else:
                    try:
                        f = open(fname, 'rb')
                        fdata = f.read()
                        f.close()
                    except IOError:
                        raise PdfParseError('Could not read PDF file %s' %
                                            fname)

            assert fdata is not None
            if not fdata.startswith('%PDF-'):
                startloc = fdata.find('%PDF-')
                if startloc >= 0:
                    log.warning('PDF header not at beginning of file')
                else:
                    lines = fdata.lstrip().splitlines()
                    if not lines:
                        raise PdfParseError('Empty PDF file!')
                    raise PdfParseError('Invalid PDF header: %s' %
                                        repr(lines[0]))

            self.version = fdata[5:8]

            endloc = fdata.rfind('%EOF')
            if endloc < 0:
                raise PdfParseError('EOF mark not found: %s' %
                                    repr(fdata[-20:]))
            endloc += 6
            junk = fdata[endloc:]
            fdata = fdata[:endloc]
            if junk.rstrip('\00').strip():
                log.warning('Extra data at end of file')

            private = self.private
            private.indirect_objects = {}
            private.deferred_objects = set()
            private.special = {
                '<<': self.readdict,
                '[': self.readarray,
                'endobj': self.empty_obj,
            }
            for tok in r'\ ( ) < > { } ] >> %'.split():
                self.special[tok] = self.badtoken

            startloc, source = self.findxref(fdata)
            private.source = source
            xref_list = []
            source.all_offsets = []
            while 1:
                source.obj_offsets = {}

                # Loop through all the cross-reference tables/streams
                trailer = self.parsexref(source)

                # Loop if any previously-written xrefs.
                prev = trailer.Prev
                if prev is None:
                    token = source.next()
                    if token != 'startxref':
                        source.warning(
                            'Expected "startxref" at end of xref table')
                    break
                if not xref_list:
                    trailer.Prev = None
                    original_trailer = trailer
                source.floc = int(prev)
                xref_list.append(source.obj_offsets)

            if xref_list:
                for update in reversed(xref_list):
                    source.obj_offsets.update(update)
                trailer.update(original_trailer)

            if trailer.Version and \
                    float(trailer.Version) > float(self.version):
                self.version = trailer.Version

            trailer = PdfDict(Root=trailer.Root,
                              Info=trailer.Info,
                              ID=trailer.ID
                              # TODO: add Encrypt when implemented
                              )
            self.update(trailer)

            #self.read_all_indirect(source)
            private.pages = self.readpages(self.Root)
            if decompress:
                self.uncompress()
        finally:
            if disable_gc:
                gc.enable()
Example #23
0
def _get_fullpage(contents, resources, mbox, bbox, rotation):
    ''' fullpage is easy.  Just copy the contents,
        set up the resources, and let _cache_xobj handle the
        rest.
    '''
    return PdfDict(contents, Resources=resources)
Example #24
0
    def _pdf_image(self, psem):
        with TemporaryDirectory(prefix="djpdf-") as temp_dir:
            # JBIG2Globals are only used in symbol mode
            # In symbol mode jbig2 writes output to files otherwise
            # it's written to stdout
            symbol_mode = self.jbig2_threshold != 1
            images_with_shared_globals = []
            if symbol_mode and SHARE_JBIG2_GLOBALS:
                # Find all Jbig2Images that share the same symbol directory
                for obj in self._factory._cache:
                    if (isinstance(obj, Jbig2Image)
                            and self.compression == obj.compression
                            and self.jbig2_threshold == obj.jbig2_threshold):
                        images_with_shared_globals.append(obj)
            else:
                # The symbol directory is not shared with other Jbig2Images
                images_with_shared_globals.append(self)
            # Promise all handled Jbig2Images the finished image
            image_futures = []
            my_image_future = None
            for image in images_with_shared_globals:
                future = asyncio.Future()
                asyncio.ensure_future(image._cache.get(future))
                image_futures.append(future)
                if image is self:
                    my_image_future = future
            # All futures are in place, the lock can be released
            self._factory._cache_lock.release()
            self._cache_lock_acquired = False

            # Prepare everything in parallel
            @asyncio.coroutine
            def get_jbig2_images(psem):
                # Convert images with ImageMagick to bitonal png in parallel
                yield from asyncio.gather(*[
                    run_command_async([
                        CONVERT_CMD, "-alpha", "remove", "-alpha", "off",
                        "-colorspace", "gray", "-threshold", "50%",
                        path.abspath(image.filename),
                        path.abspath(path.join(temp_dir, "input.%d.png" % i))
                    ], psem)
                    for i, image in enumerate(images_with_shared_globals)
                ])
                cmd = [JBIG2_CMD, "-p"]
                if symbol_mode:
                    cmd.extend(
                        ["-s", "-t",
                         format_number(self.jbig2_threshold, 4)])
                for i, _ in enumerate(images_with_shared_globals):
                    cmd.append(
                        path.abspath(path.join(temp_dir, "input.%d.png" % i)))
                jbig2_images = []
                jbig2_globals = None
                if symbol_mode:
                    yield from run_command_async(cmd, psem, cwd=temp_dir)
                    jbig2_globals = PdfDict()
                    jbig2_globals.indirect = True
                    with open(path.join(temp_dir, "output.sym"), "rb") as f:
                        jbig2_globals.stream = f.read().decode("latin-1")
                    for i, _ in enumerate(images_with_shared_globals):
                        with open(path.join(temp_dir, "output.%04d" % i),
                                  "rb") as f:
                            jbig2_images.append(f.read())
                else:
                    jbig2_images.append((yield from
                                         run_command_async(cmd,
                                                           psem,
                                                           cwd=temp_dir)))
                return jbig2_images, jbig2_globals

            @asyncio.coroutine
            def get_image_mask(image, psem):
                if image._mask is None:
                    return None
                return (yield from image._mask.pdf_image(psem))

            ((jbig2_images, jbig2_globals),
             image_masks) = yield from asyncio.gather(
                 get_jbig2_images(psem),
                 asyncio.gather(*[
                     get_image_mask(image, psem)
                     for image in images_with_shared_globals
                 ]))

            for image, jbig2_image, image_mask, image_future in zip(
                    images_with_shared_globals, jbig2_images, image_masks,
                    image_futures):
                (width, height, xres,
                 yres) = struct.unpack('>IIII', jbig2_image[11:27])
                pdf_image = PdfDict()
                pdf_image.indirect = True
                pdf_image.Type = PdfName.XObject
                pdf_image.Subtype = PdfName.Image
                pdf_image.Width = width
                pdf_image.Height = height
                if image._image_mask:
                    pdf_image.ImageMask = PdfBool(True)
                else:
                    # NOTE: DefaultGray color space is required for PDF/A
                    pdf_image.ColorSpace = PdfName.DeviceGray
                if image_mask is not None:
                    pdf_image.Mask = image_mask
                pdf_image.BitsPerComponent = 1
                pdf_image.Filter = [PdfName.JBIG2Decode]
                if symbol_mode:
                    pdf_image.DecodeParms = [{
                        PdfName.JBIG2Globals:
                        jbig2_globals
                    }]
                pdf_image.stream = jbig2_image.decode("latin-1")
                image_future.set_result(pdf_image)
        return my_image_future.result()