def splitting(*varargs, filenameOut="out"):
    """Interleave the pages of several PDF files into a single output PDF.

    Page 1 of every input is written first (in argument order), then page 2
    of every input, and so on.  Only as many rounds are written as the
    shortest input has pages; extra pages of longer inputs are dropped.

    Args:
        *varargs: Paths of the input PDF files (str); at least two.
        filenameOut: Name of the output file; '.pdf' is appended when the
            name does not already end with it.

    Raises:
        IndexError: Fewer than two input files were given.
        ValueError: An input path or ``filenameOut`` is not a str.
    """
    if len(varargs) <= 1:
        raise IndexError("Errore: inserire almeno due file.")

    for file in varargs:
        if not isinstance(file, str):
            raise ValueError("Errore: i file devono essere pdf")

    if not isinstance(filenameOut, str):
        raise ValueError("Errore: il nome del file deve essere di tipo str")

    # Parse every input exactly once and reuse the reader for all of its
    # pages, instead of re-reading each file for every output page.
    readers = [PdfReader(file) for file in varargs]
    numpage = min(len(reader.pages) for reader in readers)

    writer = PdfWriter()
    for i in range(numpage):
        for reader in readers:
            writer.addPage(reader.getPage(i))

    if not filenameOut.endswith('.pdf'):
        filenameOut = filenameOut + '.pdf'

    writer.write(filenameOut)
Example #2
0
def test_pdf(pdfname):
    """Read *pdfname* and record its name inside the parsed document.

    The output path (``outdir`` joined with the MD5 hex digest of the name
    plus '.pdf') is computed but not used further in this function.  The
    original file name is stored on the Info dictionary, or on the trailer
    itself when setting it on Info raises AttributeError.
    """
    outfn = os.path.join(outdir, '%s.pdf' % hashlib.md5(pdfname).hexdigest())
    document = PdfReader(pdfname, decompress=False)
    try:
        document.Info.OriginalFileName = pdfname
    except AttributeError:
        document.OriginalFileName = pdfname
Example #3
0
def splitting(filenameOut="out", *varargs):
    """Interleave the pages of several PDF files into a single output PDF.

    Page 1 of every input is written first (in argument order), then page 2
    of every input, and so on.  Only as many rounds are written as the
    shortest input has pages.  When no input file is given, no pages are
    added before the writer is asked to write the output.

    Args:
        filenameOut: Output name; '.pdf' is always appended to it.
        *varargs: Paths of the input PDF files (str).

    Raises:
        ValueError: An input path or ``filenameOut`` is not a str.
    """
    for file in varargs:
        if not isinstance(file, str):
            raise ValueError("Errore: i file devono essere pdf")

    if not isinstance(filenameOut, str):
        raise ValueError("Errore: il nome del file deve essere di tipo str")

    # Parse every input exactly once and reuse the reader for all of its
    # pages, instead of re-reading each file for every output page.
    readers = [PdfReader(file) for file in varargs]
    page_counts = [len(reader.pages) for reader in readers]
    # Without inputs there is nothing to interleave; previously this case
    # crashed on range(float("inf")).
    numpage = min(page_counts) if page_counts else 0

    writer = PdfWriter()
    for i in range(numpage):
        for reader in readers:
            writer.addPage(reader.getPage(i))

    writer.write(filenameOut + ".pdf")
Example #4
0
 def __init__(self, pdfFile, translator, ttfFile=TTF_FILE):
     """Load a PDF and translate its pages, fonts, Info entries and outlines.

     pdfFile is the file name or file object of the pdf file we want to translate
     translator is a unicode to unicode function
     ttfFile is the default ttf font file name used in translated pdf

     All work happens in the constructor; the translated document is left
     in self.pdf.
     """
     try:
         self.pdf=PdfReader(pdfFile, decompress=False)
     except:
         # Fallback: run the external pdftk tool with 'uncompress', writing
         # to stdout, and parse that output instead.
         print "Using pdftk to uncompress and decrypt"
         from subprocess import Popen, PIPE            
         cmd = ['pdftk', pdfFile, 'output', '-',  'uncompress']
         proc = Popen(cmd,stdout=PIPE)
         cmdout,cmderr = proc.communicate()
         # NOTE(review): stderr is not piped to Popen above, so cmderr is
         # None here and this branch looks unreachable -- confirm.
         if cmderr:
             print "Unable to open", pdfFile
             sys.exit(1)
         self.pdf=PdfReader(StringIO.StringIO(cmdout), decompress=False)
     self.decodeDicts={}
     # presumably filled by _translatePage while pages are processed -- verify
     self.font_list=[]
     self.dttf=fontTools.ttLib.TTFont(ttfFile) # default ttf
     # Prefer the cmap subtable returned by getcmap(3,1); when it is missing,
     # use the getcmap(3,10) table with its keys masked to the low 16 bits.
     if self.dttf['cmap'].getcmap(3,1) is None:        
         cmap10=self.dttf['cmap'].getcmap(3,10).cmap
         cmap={k&0xffff:v for k,v in cmap10.iteritems()}
     else:
         cmap=self.dttf['cmap'].getcmap(3,1).cmap
     self.ttf_cmap=cmap
     for n, page in enumerate(self.pdf.pages):
         # Trailing "\r" and comma keep the progress output on one line.
         print "Translating p", n , "\r",
         self._translatePage(page, translator)
     # translate fonts
     # The first font's FontFile2 stream is overwritten with the bytes of
     # ttfFile; every later font is pointed at that same stream object.
     fontfile = None
     for font in self.font_list:
         if fontfile is None:
             fontfile=font.DescendantFonts[0].FontDescriptor.FontFile2
             # NOTE(review): the file object opened here is never closed
             # explicitly.
             writeStream(fontfile, file(ttfFile,"rb").read())
         else:
             font.DescendantFonts[0].FontDescriptor.FontFile2 = fontfile
         font.ToUnicode=None
     # translate info
     if self.pdf.has_key("/Info"):
         for k,v in self.pdf.Info.iteritems():
             tv=transPdfString(v, translator)
             # None from transPdfString leaves the entry unchanged.
             if tv is not None:
                 self.pdf.Info[k]=tv                    
     # translate outlines        
     if self.pdf.Root.Outlines:
         # Walk the outline entries with an explicit stack, following both
         # First (child) and Next (sibling) links.
         array=[]
         array.append(self.pdf.Root.Outlines.First)
         while array:
             x=array.pop()
             if x.First:
                 array.append(x.First)
             if x.Next:
                 array.append(x.Next)                                
             tTitle=transPdfString(x.Title, translator)
             if tTitle is not None:
                 x.Title=tTitle                
Example #5
0
def test_pdf(pdfname):
    """Copy *pdfname* into ``outdir`` under a name derived from its MD5.

    The output file is named after the MD5 hex digest of the input name,
    with a '.pdf' suffix, and that path is reported on ``stderr``.  The
    original file name is stored on the Info dictionary, or on the trailer
    itself when setting it on Info raises AttributeError, before the
    document is written out.
    """
    outfn = os.path.join(outdir, '%s.pdf' % hashlib.md5(pdfname).hexdigest())
    print >> stderr, '             ->', outfn
    document = PdfReader(pdfname, decompress=False)
    try:
        document.Info.OriginalFileName = pdfname
    except AttributeError:
        document.OriginalFileName = pdfname
    out = PdfWriter()
    out.trailer = document
    out.write(outfn)
def remove_metadata_text(infile, ext):
    """Blank out the metadata of a PDF or DOCX file, rewriting it in place.

    ``ext`` is compared case-insensitively against "pdf" and "docx"; any
    other value leaves the file untouched.  For PDFs the Info dictionary is
    replaced by an empty one.  For DOCX files a fixed list of core
    properties is set to the empty string, the document is saved back to
    *infile* and then opened once more.
    """
    kind = ext.lower()

    if kind == "pdf":
        pdf = PdfReader(infile)
        pdf.Info = {}
        PdfWriter(infile, trailer=pdf).write()
        return

    if kind != "docx":
        return

    meta_fields = (
        "author", "category", "comments", "content_status", "identifier",
        "keywords", "language", "subject", "title",
    )
    document = Document(infile)
    core = document.core_properties
    for field_name in meta_fields:
        setattr(core, field_name, "")
    target = "%s" % infile
    document.save(target)
    # Open the saved file again, as the original code does.
    document = Document("%s" % infile)
Example #7
0
def read_and_double(inpfn):
    """Read a PDF and group its pages into tuples taken from both ends.

    Every page is converted with ``pagexobj``.  If the page count is odd,
    the first page is appended again as a sentinel so the count is even.
    While more than two pages remain, two pairs are emitted per round:
    (last, first) and then (new first, new last).  Any pages left over are
    emitted as 1-tuples.

    Returns:
        A list of tuples of converted pages.
    """
    sheets = [pagexobj(p) for p in PdfReader(inpfn, decompress=False).pages]
    if len(sheets) % 2:
        # Sentinel -- get same size for back as front
        sheets.append(sheets[0])

    grouped = []
    while len(sheets) > 2:
        grouped.append((sheets.pop(), sheets.pop(0)))
        grouped.append((sheets.pop(0), sheets.pop()))
    grouped.extend((leftover,) for leftover in sheets)
    return grouped
Example #8
0
def fingerprinter_upload(request):
    """Create uniquely fingerprinted copies of an uploaded PDF.

    Reads 'pdf-file' from request.FILES and 'copy-count' / 'file-suffix'
    from request.POST.  For each requested copy the PDF is re-parsed, given
    a random Info entry and a new random ID, written into a freshly created
    directory under STATIC_ROOT/fingerprints, and copied into
    STATIC_ROOT/drop-pdf under a unique name.

    Returns the rendered 'refingerprint_results.html' template.
    Raises Http404 when no file was uploaded.
    """
    processed_files = []

    pdf_file = request.FILES.get('pdf-file')
    copy_count = request.POST.get('copy-count', 1)
    suffix = request.POST.get('file-suffix', '')

    # Anything that cannot be converted to int falls back to a single copy.
    try:
        copy_count = int(copy_count)
    except:
        copy_count = 1

    if pdf_file is not None:
        # Make the save directory (randomly named, one per upload).
        rand_path = randomword(9)
        fingerprint_dir = os.path.join(settings.BASE_DIR, settings.STATIC_ROOT,
                                       'fingerprints', rand_path)

        os.makedirs(fingerprint_dir)

        s = os.path.splitext(pdf_file.name)
        filename = s[0]

        # Handle non-ASCII characters in the file name
        # (strangely, only wsgi seems to choke on those).
        if isinstance(filename, unicode):
            try:
                filename = unidecode(filename)
            except:
                # Fallback: replace each run of non-ASCII characters by '.'.
                filename = re.sub(r'[^\x00-\x7F]+', '.', filename)

        extension = s[1]

        file_content = pdf_file.read()

        content = PdfReader(io.BytesIO(file_content))

        # Describe the ID of the uploaded file, with the <, >, ( and )
        # delimiter characters removed.
        if content.ID is None:
            file_id = 'No ID'
        else:
            file_id = str(content.ID[0]).replace('<', '').replace('>', '')\
                    .replace('(', '').replace(')', '')

        # Bad file_ids can contain strange characters.
        # TODO When we upgrade
        try:
            file_id.encode('utf-8').strip()
        except UnicodeDecodeError:
            file_id = 'Unreadable'

        file_info = {
            'filename': pdf_file.name,
            'size': pdf_file.size,
            'id': file_id,
            'directory_name': rand_path
        }

        for copy_index in range(copy_count):
            # Copies are numbered from 1; the suffix is only inserted when
            # it is non-empty.
            if suffix and suffix != '':
                save_filename = filename + '-' + suffix + '-' + str(
                    copy_index + 1) + extension
            else:
                save_filename = filename + '-' + str(copy_index +
                                                     1) + extension

            # NOTE(review): looks like leftover debug output.
            print('AAA', save_filename)

            file_path = os.path.join(fingerprint_dir, save_filename)

            # NOTE(review): static_link and download_link are not read again
            # in this function.
            static_link = os.path.join('/pdf', save_filename)
            download_link = os.path.join('/static/drop-pdf', save_filename)

            # Parse the upload again so each copy starts from the original
            # bytes.
            content = PdfReader(io.BytesIO(file_content))

            # Add some random metadata.
            content.Info.randomMetaData = binascii.b2a_hex(
                os.urandom(20)).upper()

            # Change the ID to a random one, derived from the file name,
            # the current time and random bytes.
            md = hashlib.md5(filename)
            md.update(str(time.time()))
            md.update(os.urandom(10))

            new_id = md.hexdigest().upper()

            # Keep the length at 32: truncate, then pad with random hex
            # digits if it is shorter.
            new_id = new_id[0:32]

            while len(new_id) < 32:
                new_id += random.choice('0123456789ABCDEF')

            content.ID = [new_id, new_id]

            PdfWriter(file_path, trailer=content).write()

            # Copy the file into the online annotator with a unique name.
            # NOTE(review): unlike save_filename, the suffix is always
            # inserted here, so an empty suffix yields '--' in the name.
            annotation_name = filename + '-' + suffix + '-' \
                    + str(copy_index + 1) + '-' + rand_path + extension

            annotation_path = os.path.join(settings.BASE_DIR,
                                           settings.STATIC_ROOT, 'drop-pdf',
                                           annotation_name)

            shutil.copy(file_path, annotation_path)

            # For some reason nested directories do not provide files from static.
            # We need to clean up the double "settings" file and sanitize the basic
            # setup, but for now serve the file from a dedicated URL.

            copy_info = {
                'filename': save_filename,
                'download_path': os.path.join(rand_path, save_filename),
                'docdrop_link': annotation_name,
                'id': content.ID[0]
            }

            processed_files.append(copy_info)

    else:
        raise Http404('file not provided')

    data = {
        'processed_files': processed_files,
        'file_info': file_info,
        'archive_name': filename
    }

    print(data)

    return render_to_response('refingerprint_results.html', data)
Example #9
0
def load_pdf():
    """Load the base PDF located at ``BASE_PDF_PATH``.

    Returns:
        The ``PdfReader`` for the base PDF.  If loading fails,
        ``fail_critical`` is called with an error message; should that call
        return, this function returns None.
    """
    try:
        return PdfReader(BASE_PDF_PATH)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate instead of being reported as a PDF
        # loading failure.
        fail_critical('Base PDF did not load')
Example #10
0
def main():
    """Add ToUnicode tables to the fonts of each PDF given on the command line.

    For every PDF: the font objects are collected, a ToUnicode stream is
    generated for each embedded font that fontforge reports, the font is
    saved as an .sfd file under --outdir, and the PDF is rewritten in place.
    Afterwards the 'boxsize.txt' file in the PDF's directory is rewritten,
    appending the ink extents of the matching page to each 'display:' line.
    """
    parser = argparse.ArgumentParser(
        description='Add ToUnicode tables to PDF files.')
    parser.add_argument('--outdir',
                        default='tmp/sfd',
                        type=str,
                        help='Output .sfd files to this directory')
    parser.add_argument('pdfs',
                        type=str,
                        nargs='+',
                        help='PDF files to process')
    args = parser.parse_args()

    # Running counter across all PDFs, used to give every saved font a
    # unique 'pretexNNNNNN' name.
    fontnum = 0
    for pdf in args.pdfs:
        print("Adding ToUnicode tables to PDF file {}".format(pdf))
        with open(pdf, 'rb') as fobj:
            pdfdata = fobj.read()
        doc = PdfReader(fdata=pdfdata)
        doc.read_all()
        # Collect every indirect object whose Type is '/Font' ...
        fonts = [
            o for o in doc.indirect_objects.values()
            if hasattr(o, 'Type') and o.Type == '/Font'
        ]
        # ... and index those with a FontDescriptor by font name, dropping
        # the first character (presumably the leading '/' -- verify).
        fonts = {
            font.FontDescriptor.FontName[1:]: font
            for font in fonts if font.FontDescriptor is not None
        }
        embedded_fonts = fontforge.fontsInFile(pdf)
        for fontname in embedded_fonts:
            if fontname not in fonts:
                print(
                    "WARNING: font {} not found in pdf file".format(fontname))
                continue
            print("Adding ToUnicode table to font {}".format(fontname))
            font = fontforge.open('{}({})'.format(pdf, fontname))
            fonts[fontname].ToUnicode = PdfDict()
            fonts[fontname].ToUnicode.stream = generate_tounicode(
                font, fonts[fontname])
            # Need to save the modified font because fontforge won't read
            # ToUnicode when it converts to woff later.
            font.fontname = 'pretex{:06d}'.format(fontnum)
            # The [:-4] slice drops the last four characters of the file
            # name (assumes a '.pdf' extension -- TODO confirm).
            font.save(
                os.path.join(
                    args.outdir,
                    '[{}]{}.sfd'.format(os.path.basename(pdf)[:-4], fontname)))
            fontnum += 1
        # Overwrites the input PDF with the modified document.
        PdfWriter(pdf, trailer=doc).write()

        # Measure extents for displayed equations
        pdfpath = os.path.realpath(os.path.dirname(pdf))
        # 'doc' is rebound here: from now on it is the poppler document.
        doc = poppler.document_new_from_file(
            'file://{}'.format(os.path.realpath(pdf)), None)
        boxsize = os.path.join(pdfpath, 'boxsize.txt')
        # Read all lines first, then rewrite the same file line by line.
        with open(boxsize) as fobj:
            lines = fobj.readlines()
        with open(boxsize, 'w') as fobj:
            # pageno advances on every 'inline:' or 'display:' line;
            # presumably each such line corresponds to one page -- verify.
            pageno = 0
            for line in lines:
                if not (line.startswith('inline:')
                        or line.startswith('display:')):
                    fobj.write(line)
                    continue
                pageno += 1
                if not line.startswith('display:'):
                    fobj.write(line)
                    continue
                page = doc.get_page(pageno - 1)
                width, height = page.get_size()
                # Render onto a recording surface to obtain the ink extents.
                surf = cairo.RecordingSurface(
                    cairo.Content.COLOR_ALPHA,
                    cairo.Rectangle(0, 0, width, height))
                ctx = cairo.Context(surf)
                page.render_for_printing(ctx)
                x, y, w, h = surf.ink_extents()
                # The extents are appended directly to the stripped line,
                # with no separator added in between.
                fobj.write(line.strip() + '{},{},{},{}\n'.format(x, y, w, h))
def manuals_extract_pages_pdfrw(status_dic, PDF_file_FN, img_dir_FN):
    """Extract the image XObjects of a PDF and save each one as a PNG file.

    Args:
        status_dic: Dictionary updated in place with 'manFormat' ('PDF')
            and 'numImages' (number of images saved).
        PDF_file_FN: Object whose ``getPath()`` gives the PDF file path.
        img_dir_FN: Object whose ``pjoin(name).getPath()`` gives the path of
            an output file inside the image directory.

    Images are written as 'Image_pageNN_imgMM.png', where NN is the
    zero-based page index and MM counts the images saved from that page.
    """
    # --- Load and parse PDF ---
    reader = PdfReader(PDF_file_FN.getPath())
    log_info('PDF file "{0}"'.format(PDF_file_FN.getPath()))
    log_info('PDF has {0} pages'.format(reader.numPages))

    # --- Iterate page by page ---
    image_counter = 0
    for page_index, page in enumerate(reader.pages):
        # --- Iterate /Resources in page ---
        # log_debug('###### Processing page {0} ######'.format(page_index))
        resource_dic = page['/Resources']
        if resource_dic is None:
            # Page without a /Resources entry: nothing to extract.
            continue
        for r_name, resource in resource_dic.iteritems():
            # >> Skip non /XObject keys in /Resources
            if r_name != '/XObject': continue

            # >> Traverse /XObject dictionary data. Each page may have 0, 1 or more /XObjects
            # >> If there is more than 1 image in the page there could be more than 1 /XObject.
            # >> Some /XObjects are not images, for example, /Subtype = /Form.
            # >> NOTE Also, images may be inside the /Resources of a /Form /XObject.
            img_index = 0
            for xobj_name, xobj_dic in resource.iteritems():
                xobj_type = xobj_dic['/Type']
                xobj_subtype = xobj_dic['/Subtype']
                # >> Skip XObject forms
                if xobj_subtype == '/Form':
                    # >> NOTE There could be an image /XObject in the /Form : /Resources dictionary.
                    log_info('Skipping /Form /XObject')
                    log_info('--- xobj_dic ---')
                    log_info(pprint.pformat(xobj_dic))
                    log_info('----------------')
                    continue

                # --- Print info ---
                log_debug('------ Page {0:02d} Image {1:02d} ------'.format(page_index, img_index))
                log_debug('xobj_name     {0}'.format(xobj_name))
                log_debug('xobj_type     {0}'.format(xobj_type))
                log_debug('xobj_subtype  {0}'.format(xobj_subtype))

                # --- Extract image ---
                # Returns a PIL image object or None
                img = _extract_image_from_XObject(xobj_dic)

                # --- Save image ---
                if img:
                    # Use page_index here; the previous code referenced an
                    # undefined name ``i`` and failed with NameError on the
                    # first extracted image.
                    img_basename_str = 'Image_page{0:02d}_img{1:02d}.png'.format(page_index, img_index)
                    img_path_str = img_dir_FN.pjoin(img_basename_str).getPath()
                    log_debug('Saving IMG "{0}"'.format(img_path_str))
                    img.save(img_path_str, 'PNG')
                    image_counter += 1
                    img_index += 1
                else:
                    log_warning('Error extracting image from /XObject')
    log_info('Extracted {0} images from PDF'.format(image_counter))

    # --- Initialise status_dic ---
    status_dic['manFormat'] = 'PDF'
    status_dic['numImages'] = image_counter
Example #12
0
def pdf_page_count(filepath):
    """Count the pages of the PDF found at *filepath*."""
    document = PdfReader(filepath)
    return len(document.pages)
Example #13
0
So she did an 8.5x11" output with 0.5" margin all around
(actual size of useful area 7.5x10") and we scaled it
up by 4.8.

We also copy the Info dict to the new PDF.

'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict


def adjust(page, margin=36, scale=4.8):
    """Crop *page* by *margin* on every side and scale the result.

    The page's box is read via ``PageMerge``; the view rectangle starts at
    (margin, margin) and is the box width and height reduced by twice the
    margin.  The cropped page is scaled by *scale* and rendered.
    (The defaults presumably mean a 36-unit margin and a 4.8x
    enlargement -- confirm units against the caller.)

    Returns:
        The rendered page produced by ``PageMerge.render()``.
    """
    left, bottom, right, top = PageMerge().add(page).xobj_box
    view_width = right - left - 2 * margin
    view_height = top - bottom - 2 * margin
    merged = PageMerge().add(
        page, viewrect=(margin, margin, view_width, view_height))
    merged[0].scale(scale)
    return merged.render()


# Exactly one command-line argument is expected (the input PDF); the
# single-element unpacking raises ValueError for any other count.
inpfn, = sys.argv[1:]
# The output is written to the current directory as 'poster.<input name>'.
outfn = 'poster.' + os.path.basename(inpfn)
reader = PdfReader(inpfn)
writer = PdfWriter(outfn)
# Only the first page of the input is adjusted and written.
writer.addpage(adjust(reader.pages[0]))
# Copy the Info dict to the new PDF (an empty dict if the input has none).
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write()
Example #14
0
def test_bookmarks():
    """Test the structure of the document bookmarks.

    Each section renders a small HTML document to PDF, parses the result
    and checks the outline tree through its First/Next/Last/Parent links,
    titles and Count values.
    """
    # Heading levels that skip around: deeper headings nest under 'a'.
    pdf_bytes = FakeHTML(string='''
        <h1>a</h1>  #
        <h4>b</h4>  ####
        <h3>c</h3>  ###
        <h2>d</h2>  ##
        <h1>e</h1>  #
    ''').write_pdf()
    outlines = PdfReader(fdata=pdf_bytes).Root.Outlines
    # a
    # |_ b
    # |_ c
    # L_ d
    # e
    assert outlines.Count == '5'
    assert outlines.First.Title == '(a)'
    assert outlines.First.First.Title == '(b)'
    assert outlines.First.First.Next.Title == '(c)'
    assert outlines.First.First.Next.Next.Title == '(d)'
    assert outlines.First.Last.Title == '(d)'
    assert outlines.First.Next.Title == '(e)'
    assert outlines.Last.Title == '(e)'

    # A document without headings has no outlines at all.
    pdf_bytes = FakeHTML(string='<body>').write_pdf()
    assert PdfReader(fdata=pdf_bytes).Root.Outlines is None

    # A title with non-ASCII characters must decode back to the same text.
    pdf_bytes = FakeHTML(string='<h1>a nbsp…</h1>').write_pdf()
    outlines = PdfReader(fdata=pdf_bytes).Root.Outlines
    assert outlines.First.Title.decode() == 'a nbsp…'

    # A larger tree that includes a forced page break.
    pdf_bytes = FakeHTML(string='''
        <style>
            * { height: 90pt; margin: 0 0 10pt 0 }
        </style>
        <h1>Title 1</h1>
        <h1>Title 2</h1>
        <h2 style="position: relative; left: 20pt">Title 3</h2>
        <h2>Title 4</h2>
        <h3>Title 5</h3>
        <span style="display: block; page-break-before: always"></span>
        <h2>Title 6</h2>
        <h1>Title 7</h1>
        <h2>Title 8</h2>
        <h3>Title 9</h3>
        <h1>Title 10</h1>
        <h2>Title 11</h2>
    ''').write_pdf()
    outlines = PdfReader(fdata=pdf_bytes).Root.Outlines
    # 1
    # 2
    # |_ 3
    # |_ 4
    # |  L_ 5
    # L_ 6
    # 7
    # L_ 8
    #    L_ 9
    # 10
    # L_ 11
    assert outlines.Count == '11'
    assert outlines.First.Title == '(Title 1)'
    assert outlines.First.Next.Title == '(Title 2)'
    assert outlines.First.Next.Count == '5'
    assert outlines.First.Next.First.Title == '(Title 3)'
    assert outlines.First.Next.First.Parent.Title == '(Title 2)'
    assert outlines.First.Next.First.Next.Title == '(Title 4)'
    assert outlines.First.Next.First.Next.Count == '2'
    assert outlines.First.Next.First.Next.First.Title == '(Title 5)'
    assert outlines.First.Next.First.Next.Last.Title == '(Title 5)'
    assert outlines.First.Next.First.Next.Next.Title == '(Title 6)'
    assert outlines.First.Next.Last.Title == '(Title 6)'
    assert outlines.First.Next.Next.Title == '(Title 7)'
    assert outlines.First.Next.Next.Count == '3'
    assert outlines.First.Next.Next.First.Title == '(Title 8)'
    assert outlines.First.Next.Next.Last.Title == '(Title 8)'
    assert outlines.First.Next.Next.Last.Count == '2'
    assert outlines.First.Next.Next.First.First.Title == '(Title 9)'
    assert outlines.First.Next.Next.First.Last.Title == '(Title 9)'
    assert outlines.First.Next.Next.Next.Title == '(Title 10)'
    assert outlines.Last.Title == '(Title 10)'
    assert outlines.Last.First.Title == '(Title 11)'
    assert outlines.Last.Last.Title == '(Title 11)'

    # The document starts at h2: outline levels are relative, not absolute.
    pdf_bytes = FakeHTML(string='''
        <h2>1</h2> level 1
        <h4>2</h4> level 2
        <h2>3</h2> level 1
        <h3>4</h3> level 2
        <h4>5</h4> level 3
    ''').write_pdf()
    outlines = PdfReader(fdata=pdf_bytes).Root.Outlines
    # 1
    # L_ 2
    # 3
    # L_ 4
    #    L_ 5
    assert outlines.Count == '5'
    assert outlines.First.Title == '(1)'
    assert outlines.First.First.Title == '(2)'
    assert outlines.Last.Title == '(3)'
    assert outlines.Last.First.Title == '(4)'
    assert outlines.Last.First.First.Title == '(5)'

    # Mixed levels where a later h1 sits at the same outline level as the
    # leading h2.
    pdf_bytes = FakeHTML(string='''
        <h2>1</h2> h2 level 1
        <h4>2</h4> h4 level 2
        <h3>3</h3> h3 level 2
        <h5>4</h5> h5 level 3
        <h1>5</h1> h1 level 1
        <h2>6</h2> h2 level 2
        <h2>7</h2> h2 level 2
        <h4>8</h4> h4 level 3
        <h1>9</h1> h1 level 1
    ''').write_pdf()
    # 1
    # |_ 2
    # L_ 3
    #    L_ 4
    # 5
    # |_ 6
    # L_ 7
    #    L_ 8
    # 9
    outlines = PdfReader(fdata=pdf_bytes).Root.Outlines
    assert outlines.Count == '9'
    assert outlines.First.Title == '(1)'
    assert outlines.First.First.Title == '(2)'
    assert outlines.First.First.Next.Title == '(3)'
    assert outlines.First.First.Next.First.Title == '(4)'
    assert outlines.First.Next.Title == '(5)'
    assert outlines.First.Next.First.Title == '(6)'
    assert outlines.First.Next.First.Next.Title == '(7)'
    assert outlines.First.Next.First.Next.First.Title == '(8)'
    assert outlines.Last.Title == '(9)'

    # Reference for the next test. zoom=1
    pdf_bytes = FakeHTML(string='<h2>a</h2>').write_pdf()
    outlines = PdfReader(fdata=pdf_bytes).Root.Outlines
    assert outlines.First.Title == '(a)'
    y = float(outlines.First.A.D[3])

    # With zoom=1.5 the fourth destination entry must scale by 1.5 as well.
    pdf_bytes = FakeHTML(string='<h2>a</h2>').write_pdf(zoom=1.5)
    outlines = PdfReader(fdata=pdf_bytes).Root.Outlines
    assert outlines.First.Title == '(a)'
    assert round(float(outlines.First.A.D[3])) == round(y * 1.5)
Example #15
0
    # get blank page on back.
    for p1, p2 in zip(pages, pages[1:]):
        if p1[1] is p2[1]:
            pages.remove(p1)

    return IndirectPdfDict(
        Type=PdfName.Page,
        Contents=PdfDict(stream=''.join(page.stream for page in pages)),
        MediaBox=PdfArray([0, 0, x, y]),
        Resources=PdfDict(
            XObject=PdfDict(pages),
        ),
    )

# Exactly one command-line argument is expected (the input PDF); the
# single-element unpacking raises ValueError for any other count.
inpfn, = sys.argv[1:]
# The output is written to the current directory as 'booklet.<input name>'.
outfn = 'booklet.' + os.path.basename(inpfn)
pages = PdfReader(inpfn).pages

# Use page1 as a marker to print a blank at the end
if len(pages) & 1:
    pages.append(pages[0])

# Pair pages from both ends of the list: (last, first), then
# (new first, new last), until at most two pages remain.
bigpages = []
while len(pages) > 2:
    bigpages.append(fixpage(pages.pop(), pages.pop(0)))
    bigpages.append(fixpage(pages.pop(0), pages.pop()))

# Whatever is left over is appended unchanged, without going through fixpage.
bigpages += pages

PdfWriter().addpages(bigpages).write(outfn)
Example #16
0
def test_links():
    """Test the link annotations written to the PDF.

    Checks that a document without links has no annotations, then verifies
    the action and rectangle of every annotation for external links,
    internal anchors and a relative URL resolved against ``base_url``.
    """
    # No links: the first page must not have an Annots entry.
    pdf_bytes = FakeHTML(string='<body>').write_pdf()
    assert PdfReader(fdata=pdf_bytes).Root.Pages.Kids[0].Annots is None

    pdf_bytes = FakeHTML(
        string='''
        <style>
            body { margin: 0; font-size: 10pt; line-height: 2 }
            p { display: block; height: 90pt; margin: 0 0 10pt 0 }
            img { width: 30pt; vertical-align: top }
        </style>
        <p><a href="http://weasyprint.org"><img src=pattern.png></a></p>
        <p style="padding: 0 10pt"><a
            href="#lipsum"><img style="border: solid 1pt"
                                src=pattern.png></a></p>
        <p id=hello>Hello, World</p>
        <p id=lipsum>
            <a style="display: block; page-break-before: always; height: 30pt"
               href="#hel%6Co"></a>
        </p>
    ''',
        base_url=resource_filename('<inline HTML>')).write_pdf()
    # Flatten the annotations of all pages into one list, in page order.
    links = [
        annot for page in PdfReader(fdata=pdf_bytes).Root.Pages.Kids
        for annot in page.Annots
    ]

    # 30pt wide (like the image), 20pt high (like line-height)
    assert links[0].A == {
        '/URI': '(http://weasyprint.org)',
        '/S': '/URI',
        '/Type': '/Action'
    }
    assert [round(float(value))
            for value in links[0].Rect] == [0, TOP, 30, TOP - 20]
    # The image itself: 30*30pt
    assert links[1].A == {
        '/URI': '(http://weasyprint.org)',
        '/S': '/URI',
        '/Type': '/Action'
    }
    assert [round(float(value))
            for value in links[1].Rect] == [0, TOP, 30, TOP - 30]

    # 32pt wide (image + 2 * 1pt of border), 20pt high
    assert links[2].A.S == '/GoTo'
    assert links[2].A.Type == '/Action'
    assert links[2].A.D[1] == '/XYZ'
    assert round(float(links[2].A.D[3])) == TOP
    assert [round(float(value)) for value in links[2].Rect
            ] == [10, TOP - 100, 10 + 32, TOP - 100 - 20]
    # The image itself: 32*32pt
    assert links[3].A.S == '/GoTo'
    assert links[3].A.Type == '/Action'
    assert links[3].A.D[1] == '/XYZ'
    assert round(float(links[3].A.D[3])) == TOP
    assert [round(float(value)) for value in links[3].Rect
            ] == [10, TOP - 100, 10 + 32, TOP - 100 - 32]

    # 100% wide (block), 30pt high
    assert links[4].A.S == '/GoTo'
    assert links[4].A.Type == '/Action'
    assert links[4].A.D[1] == '/XYZ'
    assert round(float(links[4].A.D[3])) == TOP - 200
    assert [round(float(value))
            for value in links[4].Rect] == [0, TOP, RIGHT, TOP - 30]

    # 100% wide (block), 0pt high
    pdf_bytes = FakeHTML(
        string='<a href="../lipsum" style="display: block">',
        base_url='http://weasyprint.org/foo/bar/').write_pdf()
    # Exactly one annotation is expected; the unpacking fails otherwise.
    link, = [
        annot for page in PdfReader(fdata=pdf_bytes).Root.Pages.Kids
        for annot in page.Annots
    ]
    # The relative href must have been resolved against base_url.
    assert link.A == {
        '/URI': '(http://weasyprint.org/foo/lipsum)',
        '/S': '/URI',
        '/Type': '/Action',
    }
    assert [round(float(value)) for value in link.Rect] == [0, TOP, RIGHT, TOP]
Example #17
0
def test_embedded_files():
    """Test attachments embedded into the PDF.

    Attachments come from <link rel="attachment"> elements (a data URL, an
    absolute file URL and a relative file name) and from the
    ``attachments`` argument of ``write_pdf``.  For each one the stored
    content, MD5 checksum and, where set, file name or description are
    checked.  Further documents check the single-attachment case and the
    absence of Names / Outlines entries.
    """
    with temp_directory() as absolute_tmp_dir:
        absolute_tmp_file = os.path.join(absolute_tmp_dir, 'some_file.txt')
        adata = b'12345678'
        with open(absolute_tmp_file, 'wb') as afile:
            afile.write(adata)
        absolute_url = path2url(absolute_tmp_file)
        assert absolute_url.startswith('file://')

        with temp_directory() as relative_tmp_dir:
            # Non-ASCII file name, referenced relative to base_url below.
            relative_tmp_file = os.path.join(relative_tmp_dir, 'äöü.txt')
            rdata = b'abcdefgh'
            with open(relative_tmp_file, 'wb') as rfile:
                rfile.write(rdata)

            pdf_bytes = FakeHTML(
                string='''
                    <title>Test document</title>
                    <meta charset="utf-8">
                    <link
                        rel="attachment"
                        title="some file attachment äöü"
                        href="data:,hi%20there">
                    <link rel="attachment" href="{0}">
                    <link rel="attachment" href="{1}">
                    <h1>Heading 1</h1>
                    <h2>Heading 2</h2>
                '''.format(absolute_url, os.path.basename(relative_tmp_file)),
                base_url=relative_tmp_dir,
            ).write_pdf(attachments=[
                Attachment('data:,oob attachment', description='Hello'),
                'data:,raw URL',
                io.BytesIO(b'file like obj')
            ])
    # From here on both temporary directories have been left; only the PDF
    # bytes already produced are inspected.
    pdf = PdfReader(fdata=pdf_bytes)
    embedded = pdf.Root.Names.EmbeddedFiles.Names
    # Only odd indices are inspected below; presumably the array alternates
    # names and file specifications -- verify.

    # Attachment from the data: URL in the first <link>.
    assert zlib.decompress(
        embedded[1].EF.F.stream.encode('latin-1')) == b'hi there'
    assert embedded[1].EF.F.Params.CheckSum == ('<{}>'.format(
        hashlib.md5(b'hi there').hexdigest()))
    assert embedded[1].F.decode() == ''
    assert embedded[1].UF.decode() == 'attachment.bin'
    assert embedded[1].Desc.decode() == 'some file attachment äöü'

    # Attachment from the absolute file URL.
    assert zlib.decompress(
        embedded[3].EF.F.stream.encode('latin-1')) == b'12345678'
    assert embedded[3].EF.F.Params.CheckSum == ('<{}>'.format(
        hashlib.md5(adata).hexdigest()))
    assert embedded[3].UF.decode() == os.path.basename(absolute_tmp_file)

    # Attachment from the relative file name.
    assert zlib.decompress(
        embedded[5].EF.F.stream.encode('latin-1')) == b'abcdefgh'
    assert embedded[5].EF.F.Params.CheckSum == ('<{}>'.format(
        hashlib.md5(rdata).hexdigest()))
    assert embedded[5].UF.decode() == os.path.basename(relative_tmp_file)

    # Attachments passed through write_pdf(attachments=...), in order.
    assert zlib.decompress(
        embedded[7].EF.F.stream.encode('latin-1')) == b'oob attachment'
    assert embedded[7].EF.F.Params.CheckSum == ('<{}>'.format(
        hashlib.md5(b'oob attachment').hexdigest()))
    assert embedded[7].Desc.decode() == 'Hello'

    assert zlib.decompress(
        embedded[9].EF.F.stream.encode('latin-1')) == b'raw URL'
    assert embedded[9].EF.F.Params.CheckSum == ('<{}>'.format(
        hashlib.md5(b'raw URL').hexdigest()))

    assert zlib.decompress(
        embedded[11].EF.F.stream.encode('latin-1')) == b'file like obj'
    assert embedded[11].EF.F.Params.CheckSum == ('<{}>'.format(
        hashlib.md5(b'file like obj').hexdigest()))

    # A single attachment declared in the HTML only.
    pdf_bytes = FakeHTML(string='''
        <title>Test document 2</title>
        <meta charset="utf-8">
        <link
            rel="attachment"
            href="data:,some data">
    ''').write_pdf()
    pdf = PdfReader(fdata=pdf_bytes)
    embedded = pdf.Root.Names.EmbeddedFiles.Names

    assert embedded[1].EF.F.Params.CheckSum == ('<{}>'.format(
        hashlib.md5(b'some data').hexdigest()))

    # No attachments but a heading: no Names entry, outlines present.
    pdf_bytes = FakeHTML(string='''
        <title>Test document 3</title>
        <meta charset="utf-8">
        <h1>Heading</h1>
    ''').write_pdf()
    pdf = PdfReader(fdata=pdf_bytes)
    assert pdf.Root.Names is None
    assert pdf.Root.Outlines is not None

    # Neither attachments nor headings: both entries are absent.
    pdf_bytes = FakeHTML(string='''
        <title>Test document 4</title>
        <meta charset="utf-8">
    ''').write_pdf()
    pdf = PdfReader(fdata=pdf_bytes)
    assert pdf.Root.Names is None
    assert pdf.Root.Outlines is None
Example #18
0
import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge

# Get all the filenames

argv = sys.argv[1:]
# Optional '-u' flag: remembered in 'underneath' and removed from argv so
# that exactly two positional arguments remain to be unpacked below.
underneath = '-u' in argv
if underneath:
    del argv[argv.index('-u')]
# Input PDF first, watermark PDF second; any other count raises ValueError.
inpfn, wmarkfn = argv
outfn = 'watermark.' + os.path.basename(inpfn)

# Open both the source files
wmark_trailer = PdfReader(wmarkfn)
trailer = PdfReader(inpfn)

# Handle different sized pages in same document with
# a memoization cache, so we don't create more watermark
# objects than we need to (typically only one per document).

# Only the first page of the watermark file is used.
wmark_page = wmark_trailer.pages[0]
wmark_cache = {}

# Process every page (pagenum counts from 1)
for pagenum, page in enumerate(trailer.pages, 1):

    # Get the media box of the page, and see
    # if we have a matching watermark in the cache
    mbox = tuple(float(x) for x in page.MediaBox)
Example #19
0
import sys
import os
import math

from music21 import *

from pdfrw import PdfReader, PdfWriter, PdfTokens
from pdfrw.findobjs import page_per_xobj

# Maps string tokens (written in PDF literal-string form, with parentheses)
# to music21 clef classes.
# NOTE(review): presumably these are glyph codes of the score's music font
# -- confirm against the input PDF.
CLEF_MAPPING = {"(&)": clef.TrebleClef,
                "(V)": clef.Treble8vbClef,
                "(?)": clef.BassClef}

# The script takes exactly one argument: the PDF to read.
inpfn, = sys.argv[1:]
outfn = 'extract.' + os.path.basename(inpfn)
doc = PdfReader(inpfn)
# Only the first page is tokenized.
page = doc.pages[0]
# Debugging aid: truncate the content stream to its first 21000 characters.
# page.Contents.stream = page.Contents.stream[:21000]
tokens = PdfTokens(page.Contents.stream)
indent = 0
# NOTE(review): these look like PDF content-stream operators; how the list is
# used is not visible in this part of the script.
commands = ["q", "Q", "ET", "BT", "cm", "Tm", "Tf", "s", "m", "l", "S", "TJ", "f", "Tj", "k", "re", "W", "n", "K", "w", "c"]
params = []
items = []
subcommands = []
for token in tokens:
    if token == "q":
        indent += 1
        if subcommands:
            items.append(subcommands)
        subcommands = []
    elif token == "Q":
Example #20
0
# Read the data
with open(WorkingFolder + fillDataFile, 'r') as f:
    # Import the JSON data as an ordered dictionary.  No explicit close is
    # needed: the with-statement closes the file on exit.
    dataJS = json.load(f, object_pairs_hook=OrderedDict)

# Checking that entries correspond: every field in the data must have a
# configuration entry, otherwise it is reported and dropped.
# Iterate over a snapshot of the keys, because removing entries from a dict
# while iterating over it directly raises RuntimeError.
for iData in list(dataJS):
    if iData not in confCerfaJS:
        print("The field: " + iData + " has no defined configuration in the conf file")
        dataJS.pop(iData)  # delete this entry


###############################################
###############################################
# Get the document size from the MediaBox of the first page; we assume that
# all the pages have the same size.
PdfIn = PdfReader(basePDF)
sizePdf = PdfIn.Root.Pages.Kids[0].MediaBox
# Entries 2 and 3 of the MediaBox are taken as (width, height).
# NOTE(review): this assumes the box origin is at (0, 0) -- confirm for the
# base PDF.
sizeCerfa = (float(sizePdf[2]),float(sizePdf[3]))

###############################################
###############################################
# Filling in the Cerfa form

# Count how many pages are filled in: the highest "page" value found in the
# field configuration (0 when no configured page exceeds 0).
nbPage = 0
for iconfig in confCerfaJS:
    if nbPage < confCerfaJS[iconfig]["position"]["page"]:
        nbPage = confCerfaJS[iconfig]["position"]["page"]

print("Building the filled cerfa")
fillForm()
            "dismissed_dates": "; ".join(dismissed_dates),
            "conviction_charges": "; ".join(conviction_names),
            "conviction_arrest_dates": "; ".join(conviction_arrest_dates),
            "conviction_dates": "; ".join(conviction_dates),
            "defendant_name": case.summary.name,
            "county": case.summary.location,
            **FormFilling._build_six_charges(convictions + dismissals),
        }
        form = from_dict(data_class=FormData, data=form_data_dict)
        location = case.summary.location.lower()
        warning = FormFilling._warn_charge_count_overflow(
            location, convictions, dismissals)
        if warning:
            warnings.append(warning)
        pdf_path = FormFilling._build_pdf_path(location, convictions)
        pdf = PdfReader(pdf_path)
        for field in pdf.Root.AcroForm.Fields:
            field_name = field.T.lower().replace(" ", "_").replace("(",
                                                                   "").replace(
                                                                       ")", "")
            field_value = getattr(form, field_name)
            field.V = field_value
            warnings += FormFilling._set_font(field, field_value)
        for page in pdf.pages:
            annotations = page.get("/Annots")
            if annotations:
                for annotation in annotations:
                    annotation.update(PdfDict(AP=""))
        pdf.Root.AcroForm.update(PdfDict(NeedAppearances=PdfObject("true")))
        return pdf, warnings
Example #22
0
    parser.add_option('-d',
                      dest='pdfdir',
                      help='watermark all pdf files in this directory')
    parser.add_option('-o',
                      dest='outdir',
                      help='outputdir used with option -d',
                      default='tmp')
    parser.add_option('-s',
                      dest='skip_pages',
                      help='page numbers to be skipped -d',
                      default='')
    options, args = parser.parse_args()

    if options.input_fname and options.watermark_fname:
        watermark = pagexobj(
            PdfReader(options.watermark_fname, decompress=False).pages[0])
        outfn = 'watermark.' + os.path.basename(options.input_fname)
        pages = PdfReader(options.input_fname, decompress=False).pages

        skip_pages = [
            int(y) - 1
            for y in filter(lambda x: x != '', options.skip_pages.split(','))
        ]
        new_pages = []
        for i in range(len(pages)):
            if i in skip_pages:
                new_pages.append(pages[i])
            else:
                new_pages.append(fixpage(pages[i], watermark))

        PdfWriter().addpages(new_pages).write(outfn)
Example #23
0
#!/usr/bin/env python
'''
usage:   alter.py my.pdf

Creates alter.my.pdf

Demonstrates making a slight alteration to a preexisting PDF file.

'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

# Exactly one command-line argument is expected: the PDF to alter.
[inpfn] = sys.argv[1:]
outfn = 'alter.%s' % os.path.basename(inpfn)

# Load the document and change nothing but its title.
trailer = PdfReader(inpfn)
trailer.Info.Title = 'My New Title Goes Here'

# Hand the modified trailer to a writer and save it under the new name.
writer = PdfWriter()
writer.trailer = trailer
writer.write(outfn)
Example #24
0
def watermark(input_fname, watermark_fname, output_fname=None):
    """Apply the first page of a watermark PDF to every page of a PDF.

    Each page of ``input_fname`` is passed through ``fixpage`` together with
    the watermark form object, and the results are written to a new file.

    Args:
        input_fname: path of the PDF to watermark.
        watermark_fname: path of the PDF whose first page is the watermark.
        output_fname: destination path; when empty or ``None`` the name
            ``'watermark.' + basename(input_fname)`` is used.

    Returns:
        The path the result was written to.
    """
    if output_fname:
        outfn = output_fname
    else:
        outfn = 'watermark.' + os.path.basename(input_fname)

    first_wmark_page = PdfReader(watermark_fname, decompress=False).pages[0]
    w = pagexobj(first_wmark_page)

    source_pages = PdfReader(input_fname, decompress=False).pages
    stamped = [fixpage(src, w) for src in source_pages]

    PdfWriter().addpages(stamped).write(outfn)
    return outfn
Example #25
0
import sys
import os
import time
from pdfrw import PdfReader, PdfWriter, PageMerge

BOOKLET_SIZE = 20
START = time.time()

def fixpage(*pages):
    """Merge the given pages into one rendered page.

    ``None`` entries are skipped.  After merging, the last element is moved
    right by the width of the first one before the result is rendered.
    """
    present = [pg for pg in pages if pg is not None]
    merged = PageMerge() + present
    merged[-1].x += merged[0].w
    return merged.render()

# The script takes exactly one argument: the PDF to turn into booklets.
INPFN, = sys.argv[1:]
OUTFN = 'booklet.' + os.path.basename(INPFN)
ALL_IPAGES = PdfReader(INPFN).pages
# print is called with a single parenthesized argument so the same line works
# on both Python 2 and Python 3 and prints the same text.
print('The pdf file '+str(INPFN)+' has '+str(len(ALL_IPAGES))+' pages.')

# Make sure we have an even number of pages; None stands for a blank page
# and is skipped by fixpage().
if len(ALL_IPAGES) & 1:
    ALL_IPAGES.append(None)
    print('Inserting one more blank page to make pages number even.')
# NUM_OF_ITER full sub-booklets; ITERS_LEFT pages do not fill a whole one.
NUM_OF_ITER, ITERS_LEFT = divmod(len(ALL_IPAGES), BOOKLET_SIZE)

print('Making '+str(NUM_OF_ITER)+' subbooklets of '+str(BOOKLET_SIZE)+' pages each.')
opages = []
for iteration in range(0, NUM_OF_ITER):
    # Work on a copy of this sub-booklet's slice; it is consumed from both ends.
    ipages = ALL_IPAGES[iteration*BOOKLET_SIZE:(iteration+1)*BOOKLET_SIZE]
    while len(ipages) > 2:
        # Pair the outermost remaining pages: (last, first) then (first, last).
        opages.append(fixpage(ipages.pop(), ipages.pop(0)))
        opages.append(fixpage(ipages.pop(0), ipages.pop()))
Example #26
0
    # "data": [-85.8635, 38.2927, -85.6, 38.7]
    #},
    "export": {
        "width": 10000,
        #"prettyprint": True,
    }
}

# TODO: Add css
# The SVG-to-PDF conversion is done by running the cairosvg command-line tool
# in a subprocess, because the author did not want to port everything else
# to Python 3 at the time.

k.generate(config, outfile='map.svg')
subprocess.call(['cairosvg', '-o', 'map.pdf', 'map.svg', '--unsafe'])

# Read the converted map back; only its first page is used.
ipdf = PdfReader('map.pdf')
p = ipdf.pages[0]
pdf = PdfWriter()

# width_to_height_ratio = 1.3

# 10 pages wide is 85"
# 2 pages down is 22"

# It's printing at 10.75" x 6.5"

# Size of the grid, in pages, handed to squares() below.
width_pages = 8
height_pages = 5

index = squares(width_pages, height_pages)
    def run(self):
        """Generate one filled-in PDF per row of the source CSV file.

        Reads ``self.data['sourcefile']`` as CSV.  For every row the first
        page of ``template.pdf`` is placed on a fresh canvas and the row's
        values are drawn on top.  Each result is saved next to the source
        file as ``<source stem>-<n>.pdf``, with ``n`` counting rows from 1.

        Any exception stops processing and is reported through
        ``self.signals.error`` as text; on success ``self.signals.finished``
        is emitted.
        """
        try:
            source = self.data['sourcefile']
            # Take the stem from the base name only.  Joining ``folder`` with
            # a stem that still contains the directory would duplicate the
            # directory part for relative paths such as ``in/data.csv``.
            stem, _ = os.path.splitext(os.path.basename(source))
            folder = os.path.dirname(source)

            template = PdfReader("template.pdf", decompress=False).pages[0]
            template_obj = pagexobj(template)

            with open(source, 'r', newline='') as f:
                reader = csv.DictReader(f)

                for n, row in enumerate(reader, 1):
                    outfile = os.path.join(folder, f'{stem}-{n}.pdf')
                    canvas = Canvas(outfile)

                    # Draw the template page as the background.
                    xobj_name = makerl(canvas, template_obj)
                    canvas.doForm(xobj_name)

                    ystart = 443

                    # Prepared by
                    canvas.drawString(170, ystart, row.get('name', ''))

                    # Date: today's date in ISO form.  '%Y-%m-%d' is spelled
                    # out because the '%F' shorthand is not supported by
                    # strftime on every platform.
                    today = datetime.today()
                    canvas.drawString(410, ystart,
                                      today.strftime('%Y-%m-%d'))

                    # Device/Program Type
                    canvas.drawString(230, ystart - 28,
                                      row.get('program_type', ''))

                    # Product code
                    canvas.drawString(175, ystart - (2 * 28),
                                      row.get('product_code', ''))

                    # Customer
                    canvas.drawString(315, ystart - (2 * 28),
                                      row.get('customer', ''))

                    # Vendor
                    canvas.drawString(145, ystart - (3 * 28),
                                      row.get('vendor', ''))

                    ystart = 250

                    # Program Language
                    canvas.drawString(210, ystart, "Python")

                    canvas.drawString(430, ystart, row.get('n_errors', ''))

                    comments = row.get('comments', '').replace('\n', ' ')
                    if comments:
                        # The first line is wrapped at 65 characters; the
                        # rest is re-wrapped at 75 characters.
                        wrapped = textwrap.wrap(comments, width=65)
                        first_line = wrapped[0]
                        remainder = ' '.join(wrapped[1:])

                        # At most four further lines, not including the first.
                        extra_lines = textwrap.wrap(remainder, 75)[:4]

                        canvas.drawString(155, 223, first_line)
                        # Distinct loop names, so the row counter ``n`` is
                        # not overwritten.
                        for line_no, text in enumerate(extra_lines, 1):
                            canvas.drawString(80, 223 - (line_no * 28), text)

                    canvas.save()

        except Exception as e:
            # Worker boundary: report the failure instead of raising.
            self.signals.error.emit(str(e))
            return

        self.signals.finished.emit()
Example #28
0
def merge_pdfs(files, out):
    """Concatenate the pages of several PDFs, in order, into ``out``.

    Args:
        files: iterable of inputs accepted by ``PdfReader``.
        out: destination handed to ``PdfWriter.write``.
    """
    merged = PdfWriter()
    for source in files:
        source_pages = PdfReader(source).pages
        merged.addpages(source_pages)
    merged.write(out)
from pdfrw import PdfReader
import glob

URL_PREFIX = 'https://some-url.com/'

# Per-book fragments are collected in lists and joined once after the loop.
content_lines = []
yaml_entries = []

for pdf_filename in glob.glob('*.pdf'):
    mReader = PdfReader(pdf_filename)
    # The creator is decoded but not used in the output below.
    mPublisher = (mReader.Info.Creator).decode()
    mTitle = (mReader.Info.Title).decode()
    mUrl = URL_PREFIX + pdf_filename

    # One Markdown link line and one YAML list entry per book.
    content_lines.append('- [%s](%s){:target="_blank"}\n' % (mTitle, mUrl))
    yaml_entries.append('  - title: "%s"\n    link: "%s"\n' % (mTitle, mUrl))

outContent = ''.join(content_lines)
outYaml = 'books:\n' + ''.join(yaml_entries)

print(outYaml)
print(outContent)
Example #30
0
'''
usage:   unspread.py my.pdf

Creates unspread.my.pdf

Chops each page in half, e.g. if a source were
created in booklet form, you could extract individual
pages.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge


def splitpage(src):
    """Yield the left half and then the right half of page ``src``.

    Each half is rendered from a view rectangle half the page wide and the
    full page high, starting at x = 0 and x = 0.5 respectively.
    """
    for left_edge in (0, 0.5):
        half = PageMerge()
        half.add(src, viewrect=(left_edge, 0, 0.5, 1))
        yield half.render()


# Exactly one command-line argument is expected: the PDF to unspread.
[inpfn] = sys.argv[1:]
outfn = 'unspread.%s' % os.path.basename(inpfn)

writer = PdfWriter()
# Every source page contributes its two halves, in order.
for page in PdfReader(inpfn).pages:
    halves = splitpage(page)
    writer.addpages(halves)
writer.write(outfn)
Example #31
0
#! /usr/bin/env python
# encoding: utf-8

from pdfrw import PdfReader, IndirectPdfDict, BookmarkedPdfWriter
from datetime import datetime

output = BookmarkedPdfWriter()

# Append the same document three times.  Before each append the current page
# count is recorded, and three nested bookmarks are added using that count
# and the two following numbers.
# range() is used instead of xrange() so the loop runs on Python 2 and 3.
for i in range(3):
    totalPages = len(output.pagearray)
    output.addpages(
        PdfReader(
            'static_pdfs/global/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf').pages)

    bmname = 'Bm (%s) - %s' % (i + 1, 'Root')

    # NOTE(review): the third argument looks like the parent bookmark --
    # confirm against BookmarkedPdfWriter.addBookmark.
    t1 = output.addBookmark(bmname, totalPages)
    t2 = output.addBookmark("Child 1", totalPages + 1, t1)
    output.addBookmark("Child 1.1", totalPages + 2, t2)

# Build a date string of the form D:YYYYMMDDHHMMSS from the current UTC time.
now = datetime.utcnow()
date = 'D:%04d%02d%02d%02d%02d%02d' % (now.year, now.month, now.day, now.hour,
                                       now.minute, now.second)

# Fill in the document information dictionary.
info = output.trailer.Info = IndirectPdfDict()
info.Title = 'Test PDF with Bookmarks'
info.Author = 'asdasd'
info.Creator = 'random dude'
info.Producer = 'another random dude'
info.CreationDate = date
Example #32
0
#with open('moveis.csv') as csv_file:
#    csv_reader = csv.reader(csv_file, delimiter=',')
#    for row in csv_reader:
#        matricula = row[0]
#        nome = row[1]
#        gendocument(matricula,nome)

# Generate both documents for every row; the first two columns of each row
# are passed on as ``matricula`` and ``nome``.
for row in csv_mime:
    matricula = row[0]
    nome = row[1]
    gendocument(matricula, nome)
    genquestoes(matricula, nome)

writerfdr = PdfWriter()
writerfdq = PdfWriter()

# Concatenate the pages of every file in each list into its own writer.
for inpfn in fdqlist:
    writerfdq.addpages(PdfReader(inpfn).pages)

for inpfn in fdrlist:
    writerfdr.addpages(PdfReader(inpfn).pages)

movetex()

# PDF output is binary data, so the files are opened in binary mode.  A
# text-mode handle rejects bytes on Python 3 and may translate line endings
# on other setups, corrupting the file.
with open('resultfdq.pdf', 'wb') as outfn:
    writerfdq.write(outfn)

with open('resultfdr.pdf', 'wb') as outfn:
    writerfdr.write(outfn)
Example #33
0
	def __init__(self, path):
		"""Open the PDF at ``path`` and collect its field information.

		Stores the path, the parsed ``PdfReader`` object and the result of
		``self.read_fields()`` on the instance.
		"""
		self.path = path
		self.file = PdfReader(path)
		# NOTE(review): read_fields() presumably relies on self.file being
		# set first -- keep this statement order.
		self.fields_info = self.read_fields()
Example #34
0
out.close()

# Be careful when using LibreOffice to generate the PDF: in older versions the
# command-line executable can have a different name, such as loffice,
# libreoffice or similar.
# NOTE(review): the filename is interpolated into a shell command without
# quoting; names containing spaces or shell metacharacters will break it.
os.system('libreoffice5.0 --writer --headless --convert-to pdf %s ' %  (fname + '.txt'))


# The footer and page-numbering code that follows is based on this answer:
# http://stackoverflow.com/a/28283732/2397101

input_file = fname + '.pdf'
# NOTE(review): rstrip('d') removes every trailing 'd' character from fname,
# not just one -- confirm this matches the naming scheme in use.
output_file = fname.rstrip('d') + '.pdf'

# Get pages, each converted to a form XObject
reader = PdfReader(input_file)
pages = [pagexobj(p) for p in reader.pages]


# Compose new pdf
canvas = Canvas(output_file)

for page_num, page in enumerate(pages, start=1):

    # Add page: size the canvas from the page's bounding box, then draw it
    canvas.setPageSize((page.BBox[2], page.BBox[3]))
    canvas.doForm(makerl(canvas, page))

    # Draw footer
    footer_text = "Page %s of %s" % (page_num, len(pages))
    x = 128
Example #35
0
def popups_read_pdf(file):
    """Parse ``file`` with pdfrw and keep the result in the global ``popup_pdf``.

    pdfrw is imported inside the function, so it is only needed when the
    function is actually called.
    """
    global popup_pdf
    import pdfrw
    popup_pdf = pdfrw.PdfReader(file)
Example #36
0
        def handle(self, f=inputf, out=outputf, with_pdfrw=with_pdfrw):
            """Convert image ``f`` with img2pdf and verify the resulting PDF.

            The defaults bind the input file, the reference PDF and the
            ``with_pdfrw`` flag from the enclosing scope at definition time.

            The generated PDF is parsed with pdfrw and checked in three steps:
            the document structure, the embedded image data against the
            original image, and finally a byte comparison with the reference
            PDF ``out`` after both have been re-written by pdfrw.
            """
            with open(f, "rb") as inf:
                orig_imgdata = inf.read()
            output = img2pdf.convert(orig_imgdata, nodate=True,
                                     with_pdfrw=with_pdfrw)
            from io import StringIO, BytesIO
            from pdfrw import PdfReader, PdfName, PdfWriter
            from pdfrw.py23_diffs import convert_load, convert_store
            x = PdfReader(StringIO(convert_load(output)))
            # --- document structure: trailer, catalog, page tree, page ---
            self.assertEqual(sorted(x.keys()), [PdfName.Info, PdfName.Root,
                             PdfName.Size])
            self.assertEqual(x.Size, '7')
            self.assertEqual(x.Info, {})
            self.assertEqual(sorted(x.Root.keys()), [PdfName.Pages,
                                                     PdfName.Type])
            self.assertEqual(x.Root.Type, PdfName.Catalog)
            self.assertEqual(sorted(x.Root.Pages.keys()),
                             [PdfName.Count, PdfName.Kids, PdfName.Type])
            self.assertEqual(x.Root.Pages.Count, '1')
            self.assertEqual(x.Root.Pages.Type, PdfName.Pages)
            self.assertEqual(len(x.Root.Pages.Kids), 1)
            self.assertEqual(sorted(x.Root.Pages.Kids[0].keys()),
                             [PdfName.Contents, PdfName.MediaBox,
                              PdfName.Parent, PdfName.Resources, PdfName.Type])
            # The expected page size and content stream below are fixed
            # values, not derived from the input image.
            self.assertEqual(x.Root.Pages.Kids[0].MediaBox,
                             ['0', '0', '115', '48'])
            self.assertEqual(x.Root.Pages.Kids[0].Parent, x.Root.Pages)
            self.assertEqual(x.Root.Pages.Kids[0].Type, PdfName.Page)
            self.assertEqual(x.Root.Pages.Kids[0].Resources.keys(),
                             [PdfName.XObject])
            self.assertEqual(x.Root.Pages.Kids[0].Resources.XObject.keys(),
                             [PdfName.Im0])
            self.assertEqual(x.Root.Pages.Kids[0].Contents.keys(),
                             [PdfName.Length])
            self.assertEqual(x.Root.Pages.Kids[0].Contents.Length,
                             str(len(x.Root.Pages.Kids[0].Contents.stream)))
            self.assertEqual(x.Root.Pages.Kids[0].Contents.stream,
                             "q\n115.0000 0 0 48.0000 0.0000 0.0000 cm\n/Im0 "
                             "Do\nQ")

            # --- embedded image ---
            imgprops = x.Root.Pages.Kids[0].Resources.XObject.Im0

            # test if the filter is valid:
            self.assertIn(
                imgprops.Filter, [[PdfName.DCTDecode], [PdfName.JPXDecode],
                                  [PdfName.FlateDecode]])
            # test if the colorspace is valid
            self.assertIn(
                imgprops.ColorSpace, [PdfName.DeviceGray, PdfName.DeviceRGB,
                                      PdfName.DeviceCMYK])
            # test if the image has correct size
            orig_img = Image.open(f)
            self.assertEqual(imgprops.Width, str(orig_img.size[0]))
            self.assertEqual(imgprops.Height, str(orig_img.size[1]))
            # if the input file is a jpeg then it should've been copied
            # verbatim into the PDF
            if imgprops.Filter in [[PdfName.DCTDecode], [PdfName.JPXDecode]]:
                self.assertEqual(
                    x.Root.Pages.Kids[0].Resources.XObject.Im0.stream,
                    convert_load(orig_imgdata))
            elif imgprops.Filter == [PdfName.FlateDecode]:
                # otherwise, the data is flate encoded and has to be equal to
                # the pixel data of the input image
                imgdata = zlib.decompress(
                    convert_store(
                        x.Root.Pages.Kids[0].Resources.XObject.Im0.stream))
                # Translate the PDF colorspace name into the matching PIL
                # mode string.
                colorspace = imgprops.ColorSpace
                if colorspace == PdfName.DeviceGray:
                    colorspace = 'L'
                elif colorspace == PdfName.DeviceRGB:
                    colorspace = 'RGB'
                elif colorspace == PdfName.DeviceCMYK:
                    colorspace = 'CMYK'
                else:
                    raise Exception("invalid colorspace")
                im = Image.frombytes(colorspace, (int(imgprops.Width),
                                                  int(imgprops.Height)),
                                     imgdata)
                # Convert the original to a mode comparable with the decoded
                # data before comparing raw bytes.
                if orig_img.mode == '1':
                    orig_img = orig_img.convert("L")
                elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"):
                    orig_img = orig_img.convert("RGB")
                self.assertEqual(im.tobytes(), orig_img.tobytes())
                # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have
                # the close() method
                try:
                    im.close()
                except AttributeError:
                    pass
            # now use pdfrw to parse and then write out both pdfs and check the
            # result for equality
            y = PdfReader(out)
            outx = BytesIO()
            outy = BytesIO()
            xwriter = PdfWriter()
            ywriter = PdfWriter()
            xwriter.trailer = x
            ywriter.trailer = y
            xwriter.write(outx)
            ywriter.write(outy)
            self.assertEqual(outx.getvalue(), outy.getvalue())
            # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
            # close() method
            try:
                orig_img.close()
            except AttributeError:
                pass
Example #37
0
        def handle(self, f=inputf, out=outputf, with_pdfrw=with_pdfrw):
            """Convert image ``f`` with img2pdf and verify the resulting PDF.

            The defaults bind the input file, the reference PDF and the
            ``with_pdfrw`` flag from the enclosing scope at definition time.

            The generated PDF is parsed with pdfrw.  The document structure
            is checked, then every page is compared against the matching
            frame of the original image (page geometry, content stream and
            image data).  Finally the generated PDF and the reference PDF
            ``out`` are both re-written by pdfrw and compared with
            ``compare_pdf``.
            """
            with open(f, "rb") as inf:
                orig_imgdata = inf.read()
            output = img2pdf.convert(orig_imgdata, nodate=True,
                                     with_pdfrw=with_pdfrw)
            from pdfrw import PdfReader, PdfName, PdfWriter
            from pdfrw.py23_diffs import convert_load, convert_store
            x = PdfReader(PdfReaderIO(convert_load(output)))
            self.assertEqual(sorted(x.keys()), [PdfName.Info, PdfName.Root,
                             PdfName.Size])
            self.assertIn(x.Root.Pages.Count, ('1', '2'))
            # NOTE(review): len() returns an int, so comparing it with the
            # strings '1' and '2' is never true and neither branch below
            # runs.  Confirm the intended check (and the expected Size
            # values) before changing it.
            if len(x.Root.Pages.Kids) == '1':
                self.assertEqual(x.Size, '7')
                self.assertEqual(len(x.Root.Pages.Kids), 1)
            elif len(x.Root.Pages.Kids) == '2':
                self.assertEqual(x.Size, '10')
                self.assertEqual(len(x.Root.Pages.Kids), 2)
            self.assertEqual(x.Info, {})
            self.assertEqual(sorted(x.Root.keys()), [PdfName.Pages,
                                                     PdfName.Type])
            self.assertEqual(x.Root.Type, PdfName.Catalog)
            self.assertEqual(sorted(x.Root.Pages.keys()),
                             [PdfName.Count, PdfName.Kids, PdfName.Type])
            self.assertEqual(x.Root.Pages.Type, PdfName.Pages)
            orig_img = Image.open(f)
            for pagenum in range(len(x.Root.Pages.Kids)):
                # retrieve the original image frame that this page was
                # generated from
                orig_img.seek(pagenum)
                cur_page = x.Root.Pages.Kids[pagenum]

                # 96 dpi is assumed when the image carries no dpi information.
                ndpi = orig_img.info.get("dpi", (96.0, 96.0))
                # In python3, the returned dpi value for some tiff images will
                # not be an integer but a float. To make the behaviour of
                # img2pdf the same between python2 and python3, we convert that
                # float into an integer by rounding.
                # Search online for the 72.009 dpi problem for more info.
                ndpi = (int(round(ndpi[0])), int(round(ndpi[1])))
                imgwidthpx, imgheightpx = orig_img.size
                # Expected page size in PDF points (72 per inch).
                pagewidth = 72.0*imgwidthpx/ndpi[0]
                pageheight = 72.0*imgheightpx/ndpi[1]

                def format_float(f):
                    # Whole numbers are written without a fractional part;
                    # other values get four decimals with trailing zeros
                    # removed.
                    if int(f) == f:
                        return str(int(f))
                    else:
                        return ("%.4f" % f).rstrip("0")

                self.assertEqual(sorted(cur_page.keys()),
                                 [PdfName.Contents, PdfName.MediaBox,
                                  PdfName.Parent, PdfName.Resources,
                                  PdfName.Type])
                self.assertEqual(cur_page.MediaBox,
                                 ['0', '0', format_float(pagewidth),
                                  format_float(pageheight)])
                self.assertEqual(cur_page.Parent, x.Root.Pages)
                self.assertEqual(cur_page.Type, PdfName.Page)
                self.assertEqual(cur_page.Resources.keys(),
                                 [PdfName.XObject])
                self.assertEqual(cur_page.Resources.XObject.keys(),
                                 [PdfName.Im0])
                self.assertEqual(cur_page.Contents.keys(),
                                 [PdfName.Length])
                self.assertEqual(cur_page.Contents.Length,
                                 str(len(cur_page.Contents.stream)))
                self.assertEqual(cur_page.Contents.stream,
                                 "q\n%.4f 0 0 %.4f 0.0000 0.0000 cm\n"
                                 "/Im0 Do\nQ" % (pagewidth, pageheight))

                imgprops = cur_page.Resources.XObject.Im0

                # test if the filter is valid:
                self.assertIn(
                    imgprops.Filter, [PdfName.DCTDecode, PdfName.JPXDecode,
                                      PdfName.FlateDecode,
                                      [PdfName.CCITTFaxDecode]])

                # test if the image has correct size
                self.assertEqual(imgprops.Width, str(orig_img.size[0]))
                self.assertEqual(imgprops.Height, str(orig_img.size[1]))
                # if the input file is a jpeg then it should've been copied
                # verbatim into the PDF
                if imgprops.Filter in [PdfName.DCTDecode,
                                       PdfName.JPXDecode]:
                    self.assertEqual(
                        cur_page.Resources.XObject.Im0.stream,
                        convert_load(orig_imgdata))
                elif imgprops.Filter == [PdfName.CCITTFaxDecode]:
                    # Prefix the raw CCITT data with a TIFF header so that PIL
                    # can open it, then compare pixels with the original.
                    tiff_header = tiff_header_for_ccitt(
                        int(imgprops.Width), int(imgprops.Height),
                        int(imgprops.Length), 4)
                    imgio = BytesIO()
                    imgio.write(tiff_header)
                    imgio.write(convert_store(
                        cur_page.Resources.XObject.Im0.stream))
                    imgio.seek(0)
                    im = Image.open(imgio)
                    self.assertEqual(im.tobytes(), orig_img.tobytes())
                    try:
                        im.close()
                    except AttributeError:
                        pass

                elif imgprops.Filter == PdfName.FlateDecode:
                    # otherwise, the data is flate encoded and has to be equal
                    # to the pixel data of the input image
                    imgdata = zlib.decompress(
                        convert_store(cur_page.Resources.XObject.Im0.stream))
                    if imgprops.DecodeParms:
                        # With decode parameters present, the decompressed
                        # stream is compared against decompressed source
                        # data obtained in a format-specific way.
                        if orig_img.format == 'PNG':
                            pngidat, palette = img2pdf.parse_png(orig_imgdata)
                        elif orig_img.format == 'TIFF' \
                                and orig_img.info['compression'] == "group4":
                            offset, length = \
                                    img2pdf.ccitt_payload_location_from_pil(
                                            orig_img)
                            pngidat = orig_imgdata[offset:offset+length]
                        else:
                            pngbuffer = BytesIO()
                            orig_img.save(pngbuffer, format="png")
                            pngidat, palette = img2pdf.parse_png(
                                    pngbuffer.getvalue())
                        self.assertEqual(zlib.decompress(pngidat), imgdata)
                    else:
                        # Translate the PDF colorspace name into the matching
                        # PIL mode string.
                        colorspace = imgprops.ColorSpace
                        if colorspace == PdfName.DeviceGray:
                            colorspace = 'L'
                        elif colorspace == PdfName.DeviceRGB:
                            colorspace = 'RGB'
                        elif colorspace == PdfName.DeviceCMYK:
                            colorspace = 'CMYK'
                        else:
                            raise Exception("invalid colorspace")
                        im = Image.frombytes(colorspace,
                                             (int(imgprops.Width),
                                              int(imgprops.Height)),
                                             imgdata)
                        # NOTE(review): pixel data is only compared when the
                        # original mode is '1' or is not one of RGB, L, CMYK,
                        # CMYK;I; for those four modes no comparison is made
                        # here -- confirm that this is intended.
                        if orig_img.mode == '1':
                            self.assertEqual(im.tobytes(),
                                             orig_img.convert("L").tobytes())
                        elif orig_img.mode not in ("RGB", "L", "CMYK",
                                                   "CMYK;I"):
                            self.assertEqual(im.tobytes(),
                                             orig_img.convert("RGB").tobytes())
                        # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does
                        # not have the close() method
                        try:
                            im.close()
                        except AttributeError:
                            pass
            # now use pdfrw to parse and then write out both pdfs and check the
            # result for equality
            y = PdfReader(out)
            outx = BytesIO()
            outy = BytesIO()
            xwriter = PdfWriter()
            ywriter = PdfWriter()
            xwriter.trailer = x
            ywriter.trailer = y
            xwriter.write(outx)
            ywriter.write(outy)
            self.assertEqual(compare_pdf(outx.getvalue(), outy.getvalue()), True)
            # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
            # close() method
            try:
                orig_img.close()
            except AttributeError:
                pass
Example #38
0
import sys
import os
import time
from pdfrw import PdfReader, PdfWriter, PageMerge

BOOKLET_SIZE = 20
start = time.time()

def fixpage(*pages):
    """Merge the given pages into one rendered page.

    ``None`` entries are skipped.  After merging, the last element is moved
    right by the width of the first one before the result is rendered.
    """
    kept = [candidate for candidate in pages if candidate is not None]
    sheet = PageMerge() + kept
    sheet[-1].x += sheet[0].w
    return sheet.render()

# The script takes exactly one argument: the PDF to turn into booklets.
inpfn, = sys.argv[1:]
outfn = 'booklet.' + os.path.basename(inpfn)
allipages = PdfReader(inpfn).pages
# print is called with a single parenthesized argument so the same line works
# on both Python 2 and Python 3 and prints the same text.
print('The pdf file '+str(inpfn)+' has '+str(len(allipages))+' pages.')

# Make sure we have an even number of pages; None stands for a blank page
# and is skipped by fixpage().
if len(allipages) & 1:
    allipages.append(None)
    print('Inserting one more blank page to make pages number even.')
# num_of_iter full sub-booklets; iters_left pages do not fill a whole one.
num_of_iter, iters_left = divmod(len(allipages), BOOKLET_SIZE)

print('Making '+str(num_of_iter)+' subbooklets of '+str(BOOKLET_SIZE)+' pages each.')
opages = []
for iteration in range(0, num_of_iter):
    # Work on a copy of this sub-booklet's slice; it is consumed from both ends.
    ipages = allipages[iteration*BOOKLET_SIZE:(iteration+1)*BOOKLET_SIZE]
    while len(ipages) > 2:
        # Pair the outermost remaining pages: (last, first) then (first, last).
        opages.append(fixpage(ipages.pop(), ipages.pop(0)))
        opages.append(fixpage(ipages.pop(0), ipages.pop()))
Example #39
0
def form_xo_reader(imgdata):
    """Read a single-page PDF and return that page as a form XObject.

    Raises ValueError (from the unpacking) when the document does not have
    exactly one page.
    """
    [only_page] = PdfReader(imgdata).pages
    return pagexobj(only_page)
Example #40
0

#
# ------------------------------------------------------------------
# Progress messages go to stderr: a start marker, then both file names.
sys.stderr.write("*** 開始 ***\n")
# Command line: <input pdf> <output pdf>
file_in = sys.argv[1]
file_out = sys.argv[2]
sys.stderr.write(file_in + "\n")
sys.stderr.write(file_out + "\n")
#

# Output canvas in A4 portrait, with a CID font registered for the text
# drawn later.
cc = canvas.Canvas(file_out, pagesize=portrait(A4))
fontname_g = "HeiseiKakuGo-W5"
pdfmetrics.registerFont(UnicodeCIDFont(fontname_g))
#
# Note: despite the singular name, ``page`` holds the list of all pages.
page = PdfReader(file_in, decompress=False).pages
sys.stderr.write("len(page) = %d\n" % len(page))
#
# Only the first page is processed; handling of pages 2 and 3 is disabled.
pp = pagexobj(page[0])
page_1_proc(cc, pp, fontname_g)
#
# pp = pagexobj(page[1])
# page_2_proc(cc,pp,fontname_g)
# #
# pp = pagexobj(page[2])
# page_3_proc(cc,pp,fontname_g)
#
cc.save()
#
# End marker.
sys.stderr.write("*** 終了 ***\n")
# ------------------------------------------------------------------
Example #41
0
class TranslatedPdf(object):
    def __init__(self, pdfFile, translator, ttfFile=TTF_FILE):
        """            
                pdfFile is the file name or file object of the pdf file we want to translate
                translator is a unicode to unicode function
                ttfFile is the default ttf font file name used in translated pdf
        """
        try:
            self.pdf=PdfReader(pdfFile, decompress=False)
        except:
            print "Using pdftk to uncompress and decrypt"
            from subprocess import Popen, PIPE            
            cmd = ['pdftk', pdfFile, 'output', '-',  'uncompress']
            proc = Popen(cmd,stdout=PIPE)
            cmdout,cmderr = proc.communicate()
            if cmderr:
                print "Unable to open", pdfFile
                sys.exit(1)
            self.pdf=PdfReader(StringIO.StringIO(cmdout), decompress=False)
        self.decodeDicts={}
        self.font_list=[]
        self.dttf=fontTools.ttLib.TTFont(TTF_FILE) # default ttf
        if self.dttf['cmap'].getcmap(3,1) is None:        
            cmap10=self.dttf['cmap'].getcmap(3,10).cmap
            cmap={k&0xffff:v for k,v in cmap10.iteritems()}
        else:
            cmap=self.dttf['cmap'].getcmap(3,1).cmap
        self.ttf_cmap=cmap
        for n, page in enumerate(self.pdf.pages):
            print "Translating p", n , "\r",
            self._translatePage(page, translator)
        # translate fonts
        for font in self.font_list:            
            fontfile=font.DescendantFonts[0].FontDescriptor.FontFile2
            writeStream(fontfile, file(TTF_FILE,"rb").read())
            font.ToUnicode=None
        # translate info
        if self.pdf.has_key("/Info"):
            for k,v in self.pdf.Info.iteritems():
                tv=transPdfString(v, translator)
                if tv is not None:
                    self.pdf.Info[k]=tv                    
        # translate outlines        
        if self.pdf.Root.Outlines:
            array=[]
            array.append(self.pdf.Root.Outlines.First)
            while array:
                x=array.pop()
                if x.First:
                    array.append(x.First)
                if x.Next:
                    array.append(x.Next)                                
                tTitle=transPdfString(x.Title, translator)
                if tTitle is not None:
                    x.Title=tTitle                
                
    def _updatePageFontDecodeDicts(self, page):
            fonts=page.Resources.Font            
            for k, font in (fonts.iteritems() if fonts else []):                        
                font_id=_id(font)
                if font_id not in self.decodeDicts:                    
                    if font.has_key("/ToUnicode") and font.has_key("/DescendantFonts"):
                        print "Font translated:",k, autoDecode(font.BaseFont)
                        self.decodeDicts[font_id]=getFontDecodeDict(font)
                        self.font_list.append(font)
                    else:
                        print "Font not translated:", k, autoDecode(font.BaseFont)
                        self.decodeDicts[font_id]=None    
                        
    def saveAs(self, fname):
        opdf=PdfWriter()
        #print type(opdf.trailer), type(opdf.trailer.Info), type(opdf.trailer.Info.Author)
        opdf.addpages(self.pdf.pages)        
        opdf.trailer.Info=self.pdf.Info
        opdf.trailer.Root.Outlines=self.pdf.Root.Outlines
        opdf.write(fname)

    def _translatePage(self, page, translator):
        def handleText(encoded_text, decodeDict):
            if not decodeDict: 
                return encoded_text #unable to decode the text
            if encoded_text[0]!='<': # Unhandled case, never happend
                print encoded_text[:]
                return encoded_text
            b=encoded_text.decode()
            utext0=u""
            b0=b            
            while len(b):
                code = ord(b[0])*256+ord(b[1]) if len(b)>1 else ord(b[0])
                if decodeDict.has_key(code):
                    utext0 += decodeDict[code]
                else:
                    utext0 += "??"
                    print "\n??", hex(code), [hex(ord(x)) for x in str(b0)]                    
                b=b[2:]
            utext=translator(utext0)                
            gid_array=[]
            for x in utext:
                try:
                    name=self.ttf_cmap[ord(x)]
                    gid=self.dttf.getGlyphID(name)
                    gid_array.append(gid)
                except:
                    print "no gid%d"%ord(x), x                
            return "<"+"".join("%04X"%gid for gid in gid_array)+">"             
        self._updatePageFontDecodeDicts(page)
        output=""
        contents=page.Contents
        fonts=page.Resources.Font
        tokens=PdfTokens(readStream(contents))
        operands=[]
        decodeDict=None
        for tok in tokens:
            if str.isalpha(tok[0]) or tok[0] in ['"', "'"]:            
                if tok=='Tf':
                    font_name=operands[0]                    
                    decodeDict=self.decodeDicts[_id(fonts[font_name])]
                elif tok=="Tj":                    
                    operands[0]=handleText(operands[0], decodeDict)                    
                elif tok=="TJ":                
                    for n,t in enumerate(operands[1:]):
                        if t==']':
                            break
                        try:
                            tokNum=float(t)
                        except:
                            tokNum=None
                        if tokNum==None:
                            operands[n+1]=handleText(t, decodeDict)
                output += " ".join(operands+[tok]) + "\n"            
                operands=[]            
            else:
                operands.append(tok)    
        writeStream(contents, output)
Example #42
0
'''
usage:   copy.py my.pdf

Creates copy.my.pdf

Uses somewhat-functional parser.  For better results
for most things, see the Form XObject-based method.

'''

import sys
import os

from reportlab.pdfgen.canvas import Canvas

from decodegraphics import parsepage
from pdfrw import PdfReader, PdfWriter, PdfArray

# Exactly one command-line argument is expected: the PDF to copy.
(inpfn,) = sys.argv[1:]
outfn = 'copy.' + os.path.basename(inpfn)
pages = PdfReader(inpfn, decompress=True).pages
canvas = Canvas(outfn, pageCompression=0)

for page in pages:
    box = list(map(float, page.MediaBox))
    # The demo only supports media boxes anchored at the origin.
    assert box[0] == 0 and box[1] == 0, "demo won't work on this PDF"
    # The remaining entries of the box are used as the page size.
    canvas.setPageSize(box[2:])
    parsepage(page, canvas)
    canvas.showPage()

canvas.save()
Example #43
0
File: cat.py Project: ytaler/pdfrw
Creates cat.<first.pdf>

This file demonstrates two features:

1) Concatenating multiple input PDFs.

2) adding metadata to the PDF.

'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, IndirectPdfDict

# Every command-line argument names an input PDF; at least one is needed.
inputs = sys.argv[1:]
assert inputs
# The output file is named after the first input.
outfn = 'cat.' + os.path.basename(inputs[0])

# Append the pages of each input, in command-line order.
writer = PdfWriter()
for inpfn in inputs:
    source_pages = PdfReader(inpfn).pages
    writer.addpages(source_pages)

# Attach the document metadata, then write the result.
info = IndirectPdfDict(
    Title='your title goes here',
    Author='your name goes here',
    Subject='what is it all about?',
    Creator='some script goes here',
)
writer.trailer.Info = info
writer.write(outfn)
Example #44
0
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge


def fixpage(*pages):
    """Render the given pages as a single merged page.

    ``None`` placeholders are dropped first.  The final remaining page is
    shifted right by the width of the first remaining page, then the
    merge is rendered and returned.
    """
    real_pages = (candidate for candidate in pages if candidate is not None)
    sheet = PageMerge() + real_pages
    trailing = sheet[-1]
    leading = sheet[0]
    trailing.x += leading.w
    return sheet.render()


# Exactly one command-line argument is expected: the input PDF.
(inpfn,) = sys.argv[1:]
outfn = 'booklet.' + os.path.basename(inpfn)
ipages = PdfReader(inpfn).pages

# Pad with a None placeholder so the page count is even.
if len(ipages) % 2:
    ipages.append(None)

# Pair pages from the two ends of the list, four pages per pass:
# (last, first) and then (first, last).
opages = []
while len(ipages) > 2:
    back = ipages.pop()
    front = ipages.pop(0)
    opages.append(fixpage(back, front))

    front = ipages.pop(0)
    back = ipages.pop()
    opages.append(fixpage(front, back))

# Whatever is left (at most two pages) is carried over unmerged.
opages.extend(ipages)

writer = PdfWriter().addpages(opages)
writer.write(outfn)
Example #45
0
Creates watermark.my.pdf, with every page overlaid with
first page from single_page.pdf.  If -u is selected, watermark
will be placed underneath page (painted first).

NOTE 1: This program assumes that all pages (including the watermark
        page) are the same size.  For other possibilities, see
        the fancy_watermark.py example.

NOTE 2: At one point, this example was extremely complicated, with
        multiple options.  That only led to errors in implementation,
        so it has been re-simplified in order to show basic principles
        of the library operation and to match the other examples better.
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge

argv = sys.argv[1:]
# An optional '-u' flag asks for the watermark to be painted first.
underneath = '-u' in argv
if underneath:
    argv.remove('-u')
# The two remaining arguments: the input PDF and the watermark PDF.
inpfn, wmarkfn = argv
outfn = 'watermark.' + os.path.basename(inpfn)

# The watermark is the first page of the watermark file.
wmark_page = PdfReader(wmarkfn).pages[0]
wmark = PageMerge().add(wmark_page)[0]

trailer = PdfReader(inpfn)
for page in trailer.pages:
    merged = PageMerge(page).add(wmark, prepend=underneath)
    merged.render()

PdfWriter(outfn, trailer=trailer).write()
Example #46
0
def main():
    """Command-line entry point: impose the pages of a PDF into sheets.

    Parses and validates the CLI arguments, pads the input with blank
    pages up to a whole number of signatures, builds the imposed sheets
    via ``core``, optionally adds dividers and resizes, prints a summary
    when verbose, and saves the result.
    """
    args = parse_arguments()

    # validate cli arguments
    infile = core.validate_infile(args.PDF)
    signature_length = core.validate_signature_length(args.signature_length)
    papersize = core.validate_papersize(args.paperformat, args.unit)
    pages_per_sheet = core.validate_pages_per_sheet(args.nup)

    # read pdf file
    inpages = PdfReader(infile).pages

    page_count = len(inpages)

    # calculate signature length, if not set manually through cli argument
    if signature_length == 0:
        # signatures are disabled, just pad to multiple of 4
        signature_length = page_count + core.reverse_remainder(page_count, 4)
    if signature_length < 0:
        # calculate signature length
        signature_length = core.calculate_signature_length(page_count)

    # number of signatures needed to hold every page, rounded up
    signature_count = math.ceil(page_count / signature_length)

    # pad with blank pages
    blank_pages_count = signature_length * signature_count - page_count
    if blank_pages_count:
        # NOTE: list repetition appends the same blank-page object
        # blank_pages_count times rather than distinct copies.
        inpages.extend(
            [core.create_blank_copy(inpages[0])] * blank_pages_count)

    # calculate output size of single page for centering content
    # (stays 0 unless a paper size is set and centering was requested)
    output_size = 0
    if papersize and args.center_subpage:
        output_size = core.calculate_scaled_sub_page_size(
            pages_per_sheet, papersize)

    # impose and merge pages, creating sheets
    sheets = core.impose_and_merge(inpages, signature_length, pages_per_sheet,
                                   output_size, args.binding)

    # add divider pages
    if args.divider:
        sheets = core.add_divider(sheets, signature_length)

    # resize result
    if papersize:
        sheets = core.resize(sheets, papersize)

    # print infos
    if args.verbose:
        for line in textwrap.wrap(
            "Standard paper formats: {}".format(
                ', '.join(sorted(core.paperformats.keys()))), 80):
            print(line)
        print("Total input page:  {:>3}".format(page_count))
        print("Total output page: {:>3}".format(len(sheets)))

        # Sizes are taken from the last two MediaBox entries of the first
        # page / sheet; output_size is rebound here for reporting only.
        input_size = inpages[0].MediaBox[2:]
        output_size = sheets[0].MediaBox[2:]
        print("Input size:        {}x{}".format(input_size[0], input_size[1]))
        print("Output size:       {}x{}".format(int(output_size[0]),
                                                int(output_size[1])))

        print("Signature length:  {:>3}".format(signature_length))
        print("Signature count:   {:>3}".format(signature_count))
        # NOTE(review): presumably two divider pages between each pair of
        # adjacent signatures -- confirm against core.add_divider.
        divider_count = 2*signature_count - 2 if args.divider else 0
        print("Divider pages:     {:>3}".format(divider_count))

    # save imposed pdf
    core.save_pdf(infile, sheets)
    print("Imposed PDF file saved to {}".format(core.create_filename(infile)))