Beispiel #1
0
def duplicated_pdf(stream):
    """Render an HTML stream twice and return both renderings as one PDF.

    The stream is converted once with the "Original" banner and, after being
    rewound, once more with the "Duplicado" banner.  Each banner is handed to
    the converter as both of its extra arguments.  The pages of the first
    rendering are followed by the pages of the second.

    :param stream: seekable file-like object holding the HTML
    :returns: the content of the combined PDF
    """
    original_banner = "<center><h3>-- Original --</h3></center>"
    copy_banner = "<center><h3>-- Duplicado --</h3></center>"
    converter = html_to_pdf.HTMLToPDFConverter()

    original_reader = PdfFileReader(
        StringIO(converter.convert(stream, original_banner, original_banner)))

    # The first conversion consumed the stream; rewind before reusing it.
    stream.seek(0)
    copy_reader = PdfFileReader(
        StringIO(converter.convert(stream, copy_banner, copy_banner)))

    writer = PdfFileWriter()
    for reader in (original_reader, copy_reader):
        for index in range(reader.getNumPages()):
            writer.addPage(reader.getPage(index))

    buf = StringIO()
    writer.write(buf)
    return buf.getvalue()
Beispiel #2
0
def duplicated_pdf(stream):
    """Build a PDF holding an "Original" and a "Duplicado" copy of a document.

    :param stream: seekable file-like object with the HTML to render; it is
        rewound between the two conversions
    :returns: the content of the PDF made of the first rendering's pages
        followed by the second rendering's pages
    """
    pdf_conv = html_to_pdf.HTMLToPDFConverter()

    def render(banner):
        # The same banner is passed for both extra converter arguments.
        return PdfFileReader(StringIO(pdf_conv.convert(stream, banner,
                                                       banner)))

    first = render("<center><h3>-- Original --</h3></center>")
    stream.seek(0)
    second = render("<center><h3>-- Duplicado --</h3></center>")

    out = PdfFileWriter()
    for source in (first, second):
        for page_no in range(source.getNumPages()):
            out.addPage(source.getPage(page_no))

    result = StringIO()
    out.write(result)
    return result.getvalue()
Beispiel #3
0
def _dl_ctrl_list():
    """Download the Control List PDF and sanity-check its page count.

    The URL is ``config.sia_base`` plus the path returned by
    ``_get_ctrl_list_url()``; the payload is stored at ``config.pdf_dst``.

    Returns:
        True when the file was downloaded, could be parsed as a PDF and its
        page count lies within ``config.pdf_pages[0]..config.pdf_pages[1]``
        (both inclusive); False otherwise.  Every failure is logged.
    """
    url = _get_ctrl_list_url()
    if not url:
        logging.error("Could not extract Control List PDF URL")
        return False

    url = config.sia_base + url

    logging.info("Downloading Control List PDF")
    try:
        response = urllib2.urlopen(url)
    except urllib2.URLError:
        logging.error(
            "An error occurred when downloading Control List PDF %s", url)
        return False

    # Read the whole payload first so the connection is always released.
    try:
        payload = response.read()
    finally:
        response.close()

    with open(config.pdf_dst, 'wb') as pdf_file:
        pdf_file.write(payload)

    try:
        # PDFs are binary: open with 'rb' and keep the handle open only for
        # as long as the reader needs it.
        with open(config.pdf_dst, 'rb') as pdf_file:
            num_pages = PdfFileReader(pdf_file).getNumPages()
    except (IOError, PdfReadError):
        logging.error("An error occurred when attempting to open the PDF")
        return False

    if not config.pdf_pages[0] <= num_pages <= config.pdf_pages[1]:
        logging.error("PDF page number %d is out of range", num_pages)
        return False

    return True
Beispiel #4
0
def main(output_file, input_files):
    print "****** \"" + output_file + "\" への書き込み開始 ******"

    output = PdfFileWriter()
    total_pages = 0

    for f in input_files:
        # expect filename as "*.pdf"
        if f[-4:] != ".pdf":
            print "skipped file: ", f
            continue
        else:
            input = PdfFileReader(file(f, 'rb'))
            num_pages = input.getNumPages()
            total_pages += num_pages
            print f, "->", str(num_pages) + " ページ"
            for i in xrange(0, num_pages):
                output.addPage(input.getPage(i))

    outputStream = file(output_file, 'wb')
    output.write(outputStream)
    print total_pages, "ページ 書き込み"
    outputStream.close()

    print
    print "### チェック ###"

    resultFile = PdfFileReader(file(output_file, 'rb'))
    num_pages = resultFile.getNumPages()
    print output_file, "->", str(num_pages), "ページあります"

    print "****** \"" + output_file + "\" への書き込み完了 ******"
Beispiel #5
0
def comparePDF(pf, sf):
    reload(sys)
    sys.setdefaultencoding("UTF-8")
    error_flag = False
    error_info = ""
    if (pf == None and sf == None):
        print "Prod and Stage WS PDF DON'T EXIST!"
    elif (pf != None and sf == None):
        error_flag = True
        error_info = error_info + "Stage WS PDF DOESN'T EXIST!"
    elif (pf == None and sf != None):
        error_flag = True
        error_info = error_info + "Prod WS PDF DOESN'T EXIST!"
    else:
        pPdf = PdfFileReader(file(pf, "rb"))
        sPdf = PdfFileReader(file(sf, "rb"))
        if (pPdf.getNumPages() != sPdf.getNumPages()):
            error_flag = True
            error_info = error_info + (
                "Diff WS PDF Page Numbers! ---stage pages: %d---prod pages: %d \n"
                % (sPdf.getNumPages(), pPdf.getNumPages()))
        else:
            for p in range(0, sPdf.getNumPages()):
                if (pPdf.getPage(p).extractText() !=
                        sPdf.getPage(p).extractText()):
                    error_flag = True
                    error_info = error_info + ("Diff WS PDF Page %d! \n" %
                                               (p + 1))
    if (error_flag):
        raise Exception, error_info
    else:
        print "Same WS PDF!"
Beispiel #6
0
def splitXPDF(pdfFileName):
    try:
        inputpdf = PdfFileReader(open(pdfFileName, "rb"))

        print '[+] Total Page : ' + str(inputpdf.getNumPages())

        setpath = pdfFileName[pdfFileName.find('\\') + 1:pdfFileName.find('.')]
        lstName = []
        with open("nameFile.base", "r") as nameFile:
            lstName = nameFile.read().split('\n')

        if (inputpdf.getNumPages() == len(lstName)):
            for i in xrange(inputpdf.numPages):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                if (os.path.isdir('resault') != True):
                    os.mkdir('resault')
                if (os.path.isdir('resault\\' + setpath) != True):
                    os.mkdir('resault\\' + setpath)

                with open('resault\\' + setpath + '\\' + lstName[i] + '.pdf',
                          'wb') as outputStream:
                    output.write(outputStream)
                print '[+] Generate Page ' + str(
                    i + 1) + ' with File : ' + lstName[i] + '.pdf'

        else:
            print '[-] Number of Name in \'nameFile.base\' is not match with Number Page in PDF.'
    except IOError:
        print '[-] Cannot Openfile.'
Beispiel #7
0
    def test2(self):
        """Test generating several 'n-up' docs in 'legal' format.

        For each n the generated file must have ceil(pages / n) pages and
        page k must contain the text of the k-th group of n source pages.
        """

        # minipages are squeezed, i.e. they lose their original page ratio...
        # needs to be addressed later...

        for path0 in ("samples/test-legal-p.pdf",):
            for n in (2, 4, 8, 9):
                outName = os.path.splitext(path0)[0] + "-%dup.pdf" % n
                path1 = os.path.join(".", outName)
                generateNup(path0, n, path1, verbose=False) # , dirs="UL")

                # assert output has correct number of pages
                with open(path0, "rb") as source_file:
                    np0 = PdfFileReader(source_file).getNumPages()

                # The result file stays open while its text is extracted.
                with open(path1, "rb") as result_file:
                    result = PdfFileReader(result_file)
                    np1 = result.getNumPages()
                    self.assertEqual(np1, math.ceil(np0 / float(n)))

                    # assert output page(s) has/have correct text content
                    for pn in range(np1):
                        text = result.getPage(pn).extractText().split()
                        exp = group([str(num) for num in range(np0)], n)[pn]
                        self.assertEqual(text, exp)
Beispiel #8
0
def merge(fppath, bppath, outputpath, no_delete, fed_backwards):
    """Interleave the pages of two PDFs into one output PDF.

    Each page of the first file is followed by the matching page of the
    second file, as long as the second file still has one.

    :param fppath: path of the PDF supplying the first page of each pair
        (presumably the front sides)
    :param bppath: path of the PDF supplying the second page of each pair
        (presumably the back sides)
    :param outputpath: output path; ``~`` is expanded
    :param no_delete: when false, pages whose extracted text is empty are
        dropped from the result
    :param fed_backwards: when true, the second file is read in reverse
        order (its last page pairs with the first page of ``fppath``)
    """
    # PDFs are binary, so every file is opened in binary mode; the inputs
    # stay open until the writer has produced the output.
    with open(fppath, 'rb') as front_file:
        with open(bppath, 'rb') as back_file:
            fpfile = PdfFileReader(front_file)
            bpfile = PdfFileReader(back_file)
            back_count = bpfile.getNumPages()

            outputpages = []
            for i in range(fpfile.getNumPages()):
                outputpages.append(fpfile.getPage(i))
                # Explicit bounds check: with fed_backwards a missing back
                # page would otherwise become a negative index and silently
                # wrap around to an already used page.
                if i < back_count:
                    back_index = back_count - i - 1 if fed_backwards else i
                    outputpages.append(bpfile.getPage(back_index))

            if not no_delete:
                outputpages = [page for page in outputpages
                               if page.extractText() != '']

            outputfile = PdfFileWriter()
            for page in outputpages:
                outputfile.addPage(page)

            with open(os.path.expanduser(outputpath), 'wb') as out_stream:
                outputfile.write(out_stream)
Beispiel #9
0
    def test2(self):
        """Test generating several 'n-up' docs in 'legal' format.

        Checks, for every n, that the output has ceil(pages / n) pages and
        that each output page carries the text of its group of source pages.
        """

        # minipages are squeezed, i.e. they lose their original page ratio...
        # needs to be addressed later...

        for path0 in ("samples/test-legal-p.pdf", ):
            for n in (2, 4, 8, 9):
                outName = os.path.splitext(path0)[0] + "-%dup.pdf" % n
                path1 = os.path.join(".", outName)
                generateNup(path0, n, path1, verbose=False)  # , dirs="UL")

                # assert output has correct number of pages
                with open(path0, "rb") as original_file:
                    np0 = PdfFileReader(original_file).getNumPages()

                # Keep the generated file open while pages are inspected.
                with open(path1, "rb") as generated_file:
                    generated = PdfFileReader(generated_file)
                    np1 = generated.getNumPages()
                    self.assertEqual(np1, math.ceil(np0 / float(n)))

                    # assert output page(s) has/have correct text content
                    for pn in range(np1):
                        text = generated.getPage(pn).extractText().split()
                        exp = group([str(num) for num in range(np0)], n)[pn]
                        self.assertEqual(text, exp)
Beispiel #10
0
    def test_concat_pdf_files( self ):
        """Concatenating docs/a.pdf and docs/b.pdf yields the summed page count.

        The merged file docs/c.pdf is removed before and after the test.
        """
        merged_path = r"docs/c.pdf"
        try:
            os.unlink( merged_path )
        except OSError:
            pass  # nothing left over from a previous run

        # Inputs stay open until the writer has produced the merged file.
        with open( r"docs/a.pdf", 'rb' ) as file_a:
            with open( r"docs/b.pdf", 'rb' ) as file_b:
                input_a = PdfFileReader( file_a )
                input_b = PdfFileReader( file_b )

                output = PdfFileWriter()
                for reader in ( input_a, input_b ):
                    for x in range( reader.getNumPages() ):
                        output.addPage( reader.getPage( x ) )

                with open( merged_path, 'wb' ) as outputStream:
                    output.write( outputStream )

                count = input_a.getNumPages() + input_b.getNumPages()

        # Close the check handle before unlinking; an open handle blocks
        # deletion on some platforms.
        with open( merged_path, 'rb' ) as check_file:
            check = PdfFileReader( check_file )
            self.assertEqual( count, check.getNumPages() )
        os.unlink( merged_path )
Beispiel #11
0
def getNum(files):
    pages = []
    for f in files:
        pdf = PdfFileReader(file(f, "rb"))
        print f, pdf.getNumPages()
        pages.append(pdf.getNumPages())

    print sum(pages), "all together"
Beispiel #12
0
    def add_terms_and_conditions(self, ids, original_report_pdf,
                                 original_report):
        """Append the company's terms-and-conditions PDF to a report PDF.

        The document language is read from the field named by
        ``original_report.terms_conditions_language_field`` on the browsed
        record(s), falling back to the context language.  The company's
        terms entry for that language is used, or the entry whose language
        is ``'default'`` when no exact match has data.

        :param ids: ids of the records of ``original_report.model``
        :param original_report_pdf: content of the already rendered report
        :param original_report: report definition record
        :returns: the report followed by the terms pages, or
            ``original_report_pdf`` unchanged when no terms are available
        """
        user = self.env['res.users'].browse(self._uid)

        # todo change user language to report language (client language)

        language_field = original_report.terms_conditions_language_field
        record = self.env[original_report.model].browse(ids)

        # The assignment is executed in "exec" mode so that its result lands
        # in the namespace dict.
        namespace = {'o': record}
        eval('document_language = o.%s' % language_field, namespace,
             mode="exec", nocopy=True)
        document_language = namespace.get('document_language',
                                          self._context.get('lang'))

        # todo check language
        matching_terms = False
        default_terms = False
        for entry in record.company_id.terms_and_conditions:
            if entry.language == document_language:
                matching_terms = base64.decodestring(entry.datas)
            if entry.language == 'default':
                default_terms = base64.decodestring(entry.datas)

        terms_pdf = matching_terms or default_terms or False
        if not terms_pdf:
            return original_report_pdf

        writer = PdfFileWriter()
        report_reader = PdfFileReader(StringIO(original_report_pdf))
        terms_reader = PdfFileReader(StringIO(terms_pdf))
        for reader in (report_reader, terms_reader):
            for page_no in range(reader.getNumPages()):
                writer.addPage(reader.getPage(page_no))

        combined = StringIO()
        writer.write(combined)
        return combined.getvalue()
def imain(args):
    cells = []
    if not args.page:
        print 'p argument not passed. Converting all pages.'
        args.page = []
        pdf = PdfFileReader(open(args.infile,'rb'))
        print "Total Number of Pages in " + args.infile + " are " + str(pdf.getNumPages())
        for pg in range(1,pdf.getNumPages()+1):
            args.page.extend(str(pg))
    if args.checkcrop or args.checklines or args.checkdivs or args.checkcells:
        for pgs in args.page :
            print "Processing Page #" + pgs
            success = process_page(args.infile, pgs,
                bitmap=args.bitmap,
                checkcrop=args.checkcrop,
                checklines=args.checklines,
                checkdivs=args.checkdivs,
                checkcells=args.checkcells,
                whitespace=args.whitespace,
                boxes=args.boxes,
                greyscale_threshold=args.greyscale_threshold,
                page=args.page,
                crop=args.crop,
                line_length=args.line_length,
                bitmap_resolution=args.bitmap_resolution,
                name=args.name,
                pad=args.pad,
                white=args.white,
                black=args.black, outfilename=args.outfile)

    else:
        for pgs in args.page :
            print "Processing Page #" + pgs
            cells.extend(process_page(args.infile, pgs,
                bitmap=args.bitmap,
                checkcrop=args.checkcrop,
                checklines=args.checklines,
                checkdivs=args.checkdivs,
                checkcells=args.checkcells,
                whitespace=args.whitespace,
                boxes=args.boxes,
                greyscale_threshold=args.greyscale_threshold,
                page=args.page,
                crop=args.crop,
                line_length=args.line_length,
                bitmap_resolution=args.bitmap_resolution,
                name=args.name,
                pad=args.pad,
                white=args.white,
                black=args.black))

            filenames = dict()
            if args.outfile is None:
                args.outfile = sys.stdout
            filenames["{0}_filename".format(args.t)] = args.outfile
            output(cells, args.page, name=args.name, infile=args.infile, output_type=args.t, **filenames)
def makeOnePagersOld(filename='GPO-CONAN-REV-2014.pdf' ,path='pdf/'):
    """Write every page of ``filename`` to its own single-page PDF.

    Page *i* (0-based) is written to ``<path>pageindex-<ii>.pdf``; ``path``
    is used as a plain string prefix.  The page count is printed first.

    :param filename: PDF to split
    :param path: prefix (typically a directory ending in a separator) for
        the generated files
    """
    # The source stays open for the whole loop because page content is read
    # when each single-page file is written.
    with open(filename, 'rb') as source:
        infile = PdfFileReader(source)
        page_count = infile.getNumPages()
        print(page_count)
        for i in range(page_count):
            outfile = PdfFileWriter()
            outfile.addPage(infile.getPage(i))
            with open(path + 'pageindex-%02d.pdf' % i, 'wb') as outputStream:
                outfile.write(outputStream)
    def _merge_pdf(self, documents, both_sides=False):
        print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print "merge_pdf %s" % (both_sides)
        print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        """Merge PDF files into one.
 
        :param documents: list of path of pdf files
        :returns: path of the merged pdf
        """
        blankpdfstr = '''JVBERi0xLjQKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURl
                        Y29kZT4+CnN0cmVhbQp4nDPQM1Qo5ypUMFAwALJMLU31jBQsTAz1LBSKUrnCtRTyuAIVAIcdB3IK
                        ZW5kc3RyZWFtCmVuZG9iagoKMyAwIG9iago0MgplbmRvYmoKCjUgMCBvYmoKPDwKPj4KZW5kb2Jq
                        Cgo2IDAgb2JqCjw8L0ZvbnQgNSAwIFIKL1Byb2NTZXRbL1BERi9UZXh0XQo+PgplbmRvYmoKCjEg
                        MCBvYmoKPDwvVHlwZS9QYWdlL1BhcmVudCA0IDAgUi9SZXNvdXJjZXMgNiAwIFIvTWVkaWFCb3hb
                        MCAwIDU5NSA4NDJdL0dyb3VwPDwvUy9UcmFuc3BhcmVuY3kvQ1MvRGV2aWNlUkdCL0kgdHJ1ZT4+
                        L0NvbnRlbnRzIDIgMCBSPj4KZW5kb2JqCgo0IDAgb2JqCjw8L1R5cGUvUGFnZXMKL1Jlc291cmNl
                        cyA2IDAgUgovTWVkaWFCb3hbIDAgMCA1OTUgODQyIF0KL0tpZHNbIDEgMCBSIF0KL0NvdW50IDE+
                        PgplbmRvYmoKCjcgMCBvYmoKPDwvVHlwZS9DYXRhbG9nL1BhZ2VzIDQgMCBSCi9PcGVuQWN0aW9u
                        WzEgMCBSIC9YWVogbnVsbCBudWxsIDBdCi9MYW5nKGZyLUZSKQo+PgplbmRvYmoKCjggMCBvYmoK
                        PDwvQ3JlYXRvcjxGRUZGMDA1NzAwNzIwMDY5MDA3NDAwNjUwMDcyPgovUHJvZHVjZXI8RkVGRjAw
                        NEMwMDY5MDA2MjAwNzIwMDY1MDA0RjAwNjYwMDY2MDA2OTAwNjMwMDY1MDAyMDAwMzMwMDJFMDAz
                        NT4KL0NyZWF0aW9uRGF0ZShEOjIwMTIxMTAzMTQ0NzEwKzAxJzAwJyk+PgplbmRvYmoKCnhyZWYK
                        MCA5CjAwMDAwMDAwMDAgNjU1MzUgZiAKMDAwMDAwMDIyNiAwMDAwMCBuIAowMDAwMDAwMDE5IDAw
                        MDAwIG4gCjAwMDAwMDAxMzIgMDAwMDAgbiAKMDAwMDAwMDM2OCAwMDAwMCBuIAowMDAwMDAwMTUx
                        IDAwMDAwIG4gCjAwMDAwMDAxNzMgMDAwMDAgbiAKMDAwMDAwMDQ2NiAwMDAwMCBuIAowMDAwMDAw
                        NTYyIDAwMDAwIG4gCnRyYWlsZXIKPDwvU2l6ZSA5L1Jvb3QgNyAwIFIKL0luZm8gOCAwIFIKL0lE
                        IFsgPEYyMjBCNDlBNjRDOEEzRDY3QUFBQzNCODAwNkI5RkRDPgo8RjIyMEI0OUE2NEM4QTNENjdB
                        QUFDM0I4MDA2QjlGREM+IF0KL0RvY0NoZWNrc3VtIC83NzUwQTAyMEVFNEUwQkU5NjVGMzBDNTND
                        MkRGNUFGNgo+PgpzdGFydHhyZWYKNzM2CiUlRU9GCg=='''

        writer = PdfFileWriter()
        blank_page = PdfFileReader(
            StringIO.StringIO(blankpdfstr.decode("base64"))).pages[0]
        streams = [
        ]  # We have to close the streams *after* PdfFilWriter's call to write()
        for document in documents:
            pdfreport = file(document, 'rb')
            streams.append(pdfreport)
            reader = PdfFileReader(pdfreport)
            for page in range(0, reader.getNumPages()):
                writer.addPage(reader.getPage(page))
            if reader.getNumPages() % 2 and both_sides:
                writer.addPage(blank_page)

        merged_file_fd, merged_file_path = tempfile.mkstemp(
            suffix='.html', prefix='report.merged.tmp.')
        with closing(os.fdopen(merged_file_fd, 'w')) as merged_file:
            writer.write(merged_file)

        for stream in streams:
            stream.close()

        return merged_file_path
 def testSplitPdfBasic(self):
     """A split point beyond the last page yields only the first result file.

     The first result must have as many pages as the source and the second
     result file must not be created.
     """
     pdfHelper = PdfHelper()
     with open(self.PDF1, "rb") as file1:
         pdfReader1 = PdfFileReader(file1)
         page_count = pdfReader1.getNumPages()
         splitPoint = page_count + 5

         # the split point is upper than the number of pages
         pdfHelper.split_pdfs(file1, splitPoint, self.RESULT_FILE_SPLIT1, self.RESULT_FILE_SPLIT2)
         assert os.path.exists(self.RESULT_FILE_SPLIT1)
         assert not os.path.exists(self.RESULT_FILE_SPLIT2)

     # PDFs are binary: read the result with 'rb' and close it afterwards.
     with open(self.RESULT_FILE_SPLIT1, "rb") as result_file:
         pdfReaderResult = PdfFileReader(result_file)
         assert page_count == pdfReaderResult.getNumPages()
Beispiel #17
0
def mergePDF(input_file_1, input_file_2, output_file):
    """Write the pages of two PDFs, in order, to ``output_file``.

    :param input_file_1: path of the PDF whose pages come first
    :param input_file_2: path of the PDF whose pages come second
    :param output_file: path of the PDF to create
    """
    output = PdfFileWriter()
    # Both inputs stay open until the writer has finished, then are closed
    # even if writing fails.
    with open(input_file_1, "rb") as stream1:
        with open(input_file_2, "rb") as stream2:
            for reader in (PdfFileReader(stream1), PdfFileReader(stream2)):
                for i in range(reader.getNumPages()):
                    output.addPage(reader.getPage(i))

            with open(output_file, "wb") as outputStream:
                output.write(outputStream)
 def testSplitPdf(self):
     """Splitting two pages before the end yields two files of the right size.

     The first result must hold ``splitPoint`` pages and the second the
     remaining pages of the source.
     """
     pdfHelper = PdfHelper()
     with open(self.PDF1, "rb") as file1:
         pdfReader1 = PdfFileReader(file1)
         page_count = pdfReader1.getNumPages()
         splitPoint = page_count - 2

         pdfHelper.split_pdfs(file1, splitPoint, self.RESULT_FILE_SPLIT1, self.RESULT_FILE_SPLIT2)
         assert os.path.exists(self.RESULT_FILE_SPLIT1)
         assert os.path.exists(self.RESULT_FILE_SPLIT2)

     # PDFs are binary: read the results with 'rb' and close them afterwards.
     with open(self.RESULT_FILE_SPLIT1, "rb") as first_part:
         splitFile1 = PdfFileReader(first_part)
         assert splitFile1.getNumPages() == splitPoint
     with open(self.RESULT_FILE_SPLIT2, "rb") as second_part:
         splitFile2 = PdfFileReader(second_part)
         assert splitFile2.getNumPages() == page_count - splitPoint
    def testMergin(self):
        """Merging two PDFs creates a file holding the pages of both."""
        pdfHelper = PdfHelper()
        # NOTE(review): the attribute is spelled PFD2 (not PDF2); confirm it
        # matches the fixture definition.
        with open(self.PDF1, "rb") as file1:
            with open(self.PFD2, "rb") as file2:
                assert not os.path.exists(self.RESULT_FILE_MERGIN)
                pdfHelper.merge_pdfs((file1, file2), os.path.join('data', 'result.pdf'))
                assert os.path.exists(self.RESULT_FILE_MERGIN)

                pdfReader1 = PdfFileReader(file1)
                pdfReader2 = PdfFileReader(file2)
                expected_pages = (pdfReader1.getNumPages() +
                                  pdfReader2.getNumPages())

        with open(self.RESULT_FILE_MERGIN, "rb") as result_file:
            pdfReaderResult = PdfFileReader(result_file)
            assert expected_pages == pdfReaderResult.getNumPages()
Beispiel #20
0
def compact(filein):
    output = PdfFileWriter()
    input1 = PdfFileReader(file(filein, "rb"))
    # print the title of the document
    print "title = %s" % (input1.getDocumentInfo().title)
    n = input1.getNumPages()
    # loop over pages
    for i in range(1, n):
        curr = input1.getPage(i)
        prev = input1.getPage(i - 1)
        currTxt = curr.extractText()[:-3]
        prevTxt = prev.extractText()[:-3]
        if currTxt.find(prevTxt) == 0:  # prevTxt is prefix of currTxt
            pass  # the current page is an extension to the previous one -> continue
        else:
            output.addPage(
                prev)  # current page is something new -> save latest old one
    output.addPage(input1.getPage(n - 1))  # add last page
    # write output
    fileout = "%s-compact%s" % os.path.splitext(filein)
    outputStream = file(fileout, "wb")
    output.write(outputStream)
    outputStream.close()
    # print some information
    n2 = output.getNumPages()
    print "%s has %s pages." % (filein, n)
    print "%s has %s pages." % (fileout, n2)
    print "-> removed %s pages\n" % (n - n2)
Beispiel #21
0
 def create_source_pdf(self, cr, uid, ids, data, report_xml, context=None):
     """Render the report for ``ids``, reusing or storing attachments.

     When ``report_xml.attachment`` is set, each object is handled on its
     own: an existing attachment is reused if allowed, otherwise the object
     is rendered with ``create_single_pdf`` and the result is stored as a
     new ``ir.attachment``.  If the first collected result is a PDF, all
     results are merged into one PDF.  In every other case the report is
     rendered in one go for all ``ids``.

     :returns: ``(content, format)`` tuple, or ``False`` as soon as
         rendering a single object fails
     """
     if not context:
         context = {}
     pool = pooler.get_pool(cr.dbname)
     attach = report_xml.attachment
     if attach:
         objs = self.getObjects(cr, uid, ids, context)
         results = []
         for obj in objs:
             # NOTE(review): ``attach`` is evaluated as a Python expression
             # with ``object`` and ``time`` in scope; it must only ever come
             # from trusted report configuration.
             aname = eval(attach, {'object': obj, 'time': time})
             result = False
             # Reuse a stored attachment only when the report allows it and
             # the caller did not disable it through the context.
             if report_xml.attachment_use and aname and context.get(
                     'attachment_use', True):
                 aids = pool.get('ir.attachment').search(
                     cr, uid, [('datas_fname', '=', aname + '.pdf'),
                               ('res_model', '=', self.table),
                               ('res_id', '=', obj.id)])
                 if aids:
                     brow_rec = pool.get('ir.attachment').browse(
                         cr, uid, aids[0])
                     # An attachment without data skips this object
                     # entirely: nothing is rendered or appended for it.
                     if not brow_rec.datas:
                         continue
                     d = base64.decodestring(brow_rec.datas)
                     results.append((d, 'pdf'))
                     continue
             result = self.create_single_pdf(cr, uid, [obj.id], data,
                                             report_xml, context)
             if not result:
                 return False
             if aname:
                 # Storing the attachment is best effort: a failure is
                 # logged and the rendered result is still returned.
                 try:
                     name = aname + '.' + result[1]
                     pool.get('ir.attachment').create(
                         cr,
                         uid, {
                             'name': aname,
                             'datas': base64.encodestring(result[0]),
                             'datas_fname': name,
                             'res_model': self.table,
                             'res_id': obj.id,
                         },
                         context=context)
                 except Exception:
                     #TODO: should probably raise a proper osv_except instead, shouldn't we? see LP bug #325632
                     logging.getLogger('report').error(
                         'Could not create saved report attachment',
                         exc_info=True)
             results.append(result)
         if results:
             # Only PDF results are merged; for any other format control
             # falls through to the final create_single_pdf call below.
             if results[0][1] == 'pdf':
                 from pyPdf import PdfFileWriter, PdfFileReader
                 output = PdfFileWriter()
                 for r in results:
                     reader = PdfFileReader(cStringIO.StringIO(r[0]))
                     for page in range(reader.getNumPages()):
                         output.addPage(reader.getPage(page))
                 s = cStringIO.StringIO()
                 output.write(s)
                 return s.getvalue(), results[0][1]
     return self.create_single_pdf(cr, uid, ids, data, report_xml, context)
Beispiel #22
0
def joinpdf(folder=TMPFOLDER,startpage=INDEX,outputname='freecad.pdf'):
    """creates one pdf file from several others, following order from startpage

    The page order is taken from the links inside the first ``<ul>`` block
    of ``<folder>/<startpage>.html``; the start page itself is inserted as
    the second entry.  Every entry for which ``exists()`` is true is
    appended from ``<folder>/<page>.pdf``; unreadable PDFs are reported and
    skipped.  The result is written to ``OUTPUTPATH/<outputname>``.
    """
    if VERBOSE: print ("Building table of contents...")
    with open(folder+os.sep+startpage+'.html') as f:
        html = f.read()
    html = html.replace("\n"," ")
    html = html.replace("> <","><")
    html = re.findall("<ul.*/ul>",html)[0]
    pages = re.findall('href="(.*?)"',html)
    pages.insert(1,startpage+".html")
    result = PdfFileWriter()
    # Inputs must stay open until result.write(); they are closed afterwards.
    opened = []
    try:
        for p in pages:
            if exists(p[:-5]):
                if VERBOSE: print ('Appending',p)
                try:
                    handle = open(folder+os.sep+p[:-5]+'.pdf','rb')
                    opened.append(handle)
                    inputfile = PdfFileReader(handle)
                # Best effort per page, but no longer swallowing
                # KeyboardInterrupt/SystemExit like a bare except.
                except Exception: print ('Unable to append',p)
                else:
                    for i in range(inputfile.getNumPages()):
                        result.addPage(inputfile.getPage(i))
        with open(OUTPUTPATH + os.sep + outputname,'wb') as outputfile:
            result.write(outputfile)
    finally:
        for handle in opened:
            handle.close()
    if VERBOSE: print ('Successfully created',OUTPUTPATH,os.sep,outputname)
Beispiel #23
0
def joinpdf(folder=TMPFOLDER,startpage=INDEX,outputname='freecad.pdf'):
    """creates one pdf file from several others, following order from startpage

    Links found in the first ``<ul>`` block of ``<folder>/<startpage>.html``
    define the order (the start page is inserted as second entry).  Each
    entry accepted by ``exists()`` is read from ``<folder>/<page>.pdf``;
    PDFs that cannot be read are reported and skipped.  The joined document
    is written to ``OUTPUTPATH/<outputname>``.
    """
    if VERBOSE: print ("Building table of contents...")
    with open(folder+os.sep+startpage+'.html') as f:
        html = f.read()
    html = html.replace("\n"," ")
    html = html.replace("> <","><")
    html = re.findall("<ul.*/ul>",html)[0]
    pages = re.findall('href="(.*?)"',html)
    pages.insert(1,startpage+".html")
    result = PdfFileWriter()
    # Keep every source open until the writer is done, then release them.
    sources = []
    try:
        for p in pages:
            if exists(p[:-5]):
                if VERBOSE: print ('Appending',p)
                try:
                    source = open(folder+os.sep+p[:-5]+'.pdf','rb')
                    sources.append(source)
                    inputfile = PdfFileReader(source)
                except Exception: print ('Unable to append',p)
                else:
                    for i in range(inputfile.getNumPages()):
                        result.addPage(inputfile.getPage(i))
        with open(OUTPUTPATH + os.sep + outputname,'wb') as outputfile:
            result.write(outputfile)
    finally:
        for source in sources:
            source.close()
    if VERBOSE: print ('Successfully created',OUTPUTPATH,os.sep,outputname)
Beispiel #24
0
def split_pset():
    if (not options.pset or not options.probs):
        print_err_and_die("You must enter both arguements! run with -h for help")

    path = "pset%s/latex/"%options.pset
    try:
        filename = "%spset%s_answers.pdf"%(path, options.pset)
        inp = PdfFileReader(file(filename, "rb"))
    except IOError:
        print_err_and_die("Error! File, %s was not found." % filename)
    
    ##loop over user input and break up pdf
    questionNum = 1
    probs = options.probs.split(",")
    for prob in probs:
        print "Processing question", questionNum

        prob = prob.strip() #kill whitespace

        out = PdfFileWriter()
        pages = get_pages(prob, inp.getNumPages())

        for page in pages:
            print "page num", str(page)
            out.addPage(inp.getPage(int(page)-1))

        outStream = file("%spset%s-%s_answer.pdf"%(path, options.pset, questionNum), "wb")
        out.write(outStream)
        outStream.close()
        questionNum +=1

    print "Done!"
class cleanpdf:
    """Copy a PDF while overwriting the output's document-info values.

    Constructing an instance immediately replaces every entry of the
    writer's info dictionary with an HTML/JavaScript test payload, copies
    all pages of ``pathFile`` and writes the result next to the input, with
    "5" inserted before a ".pdf" extension.

    NOTE(review): despite the name, this injects a payload into the
    metadata rather than cleaning it; confirm the intent.
    """

    def __init__(self, pathFile):
        """Process ``pathFile`` right away; see the class docstring."""
        self.pathFile = pathFile
        # Kept open as an attribute because ``pdfInput`` reads from it
        # lazily; call close() when the instance is no longer needed.
        self.inputFile = open(self.pathFile, "rb")
        self.pdfInput = PdfFileReader(self.inputFile)
        self.pyPdfOutput = PdfFileWriter()
        self.dataToUpdate = self.pyPdfOutput._info.getObject()
        self.__modifyData()
        self.__copyPDF()

    def close(self):
        """Release the input file handle."""
        self.inputFile.close()

    def __modifyData(self):
        """Replace the value of every document-info entry with the payload."""
        payload = '<h1 onmouseover=alert(1)>'.encode('ascii')
        # Iterate over a snapshot of the keys while the values are replaced.
        for data in list(self.dataToUpdate):
            self.dataToUpdate[data] = createStringObject(payload)

    def __copyPDF(self):
        """Copy all input pages to the writer and write the output file."""
        for page in range(self.pdfInput.getNumPages()):
            self.pyPdfOutput.addPage(self.pdfInput.getPage(page))
        # Closing the output guarantees the data is flushed to disk.
        with open(self.__changeName(), "wb") as outputFile:
            self.pyPdfOutput.write(outputFile)

    def __changeName(self):
        """Return the output path: the input path without its extension
        plus "5.pdf".

        os.path.splitext is used so that a name without an extension (or a
        dot that only appears in a directory name) is not truncated.
        """
        import os
        return os.path.splitext(self.pathFile)[0] + "5.pdf"
Beispiel #26
0
def writeFooter(filename):
    """Stamp a footer onto every page of ``PDF/test.pdf``.

    The footer consists of the module-level ``url`` and today's date
    (``%m/%d/%y``) near the bottom of a letter-sized page.  The stamped
    document is written to ``PDF/<filename>``.

    :param filename: name of the output file inside the ``PDF`` directory
    """
    packet = StringIO.StringIO()
    # create a new PDF with Reportlab
    can = canvas.Canvas(packet, pagesize=letter)
    can.setFont("Helvetica", 11)
    # Writing the new line
    oknow = time.strftime("%m/%d/%y")
    can.drawString(27, 25, url)
    can.drawString(500, 25, oknow)
    can.save()
    # move to the beginning of the StringIO buffer
    packet.seek(0)
    footer_page = PdfFileReader(packet).getPage(0)

    output = PdfFileWriter()
    # read your existing PDF; it stays open until the output is written
    with open('PDF/test.pdf', "rb") as existing_file:
        existing_pdf = PdfFileReader(existing_file)
        # add the "watermark" (which is the new pdf) on the existing page
        for x in range(existing_pdf.getNumPages()):
            page = existing_pdf.getPage(x)
            page.mergePage(footer_page)
            output.addPage(page)
        # finally, write "output" to a real file
        with open('PDF/' + filename, "wb") as outputStream:
            output.write(outputStream)
Beispiel #27
0
def delete(filesandranges, outputfilename, verbose):
    """Concatenate PDFs while leaving out the listed pages.

    :param filesandranges: list of dicts with the keys ``'name'`` (path of
        an input PDF) and ``'pages'`` (1-based page numbers to drop)
    :param outputfilename: path of the PDF to create; must not exist yet
    :param verbose: unused, kept for interface compatibility

    Exits with status 2 (after calling ``halp()``) when an input is
    missing, the output already exists, or an input cannot be processed.
    """
    for entry in filesandranges:
        if not os.path.exists(entry['name']):
            halp()
            print ("error: "+entry['name']+" does not exist... exiting nao")
            sys.exit(2) # pdf file is no pdf file...
    if os.path.exists(outputfilename):
        halp()
        # Report the output file; the loop variable of the input check was
        # used here before, which named the wrong file and was undefined
        # for an empty list.
        print ("error: "+outputfilename+" does already exist... exiting nao")
        sys.exit(2) # pdf file is no pdf file...

    output = PdfFileWriter()
    # Inputs stay open until the output has been written.
    opened = []
    try:
        try:
            for pdf in filesandranges:
                print (pdf["name"])
                handle = open(pdf["name"], "rb")
                opened.append(handle)
                fiel = PdfFileReader(handle)

                for pagenr in range(1, fiel.getNumPages()+1):
                    if pagenr not in pdf["pages"]:
                        output.addPage(fiel.getPage(pagenr-1))
        except Exception:
            halp()
            sys.exit(2) # pdf file is no pdf file...

        # Re-check right before writing so an output that appeared in the
        # meantime is not overwritten.
        if not os.path.exists(outputfilename):
            with open(outputfilename, "wb") as outputStream:
                output.write(outputStream)
        else:
            print ("file exists, discontinuing operation")
    finally:
        for handle in opened:
            handle.close()
Beispiel #28
0
def split(in_file, start, end, out_file):
    """Copy a page range of a PDF into a new file.

    :param in_file: path of the source PDF
    :param start: index of the first page to copy (0-based, inclusive)
    :param end: index after the last page to copy (exclusive)
    :param out_file: output file name, created relative to the current
        directory (``"./" + out_file``)

    If the range is empty or outside the document, a message is printed
    and nothing is written.
    """
    # The input stream is closed on every path, including the early return.
    with open(in_file, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)
        page_count = pdf_input.getNumPages()

        # Validate the requested range.
        if start < 0 or end > page_count or start >= end:
            print("页码有误")  # message text: "invalid page numbers"
            return

        # Collect the requested pages.
        pdf_out = PdfFileWriter()
        for j in range(start, end):
            pdf_out.addPage(pdf_input.getPage(j))

        # Write them while the input is still open.
        with open("./" + out_file, 'wb') as out_stream:
            pdf_out.write(out_stream)
Beispiel #29
0
def combine_pdfs(*args, **kwargs):
    """Concatenate PDF files with the external PDFtk command.

    :param args: paths of the PDFs to combine, in output order.
    :param add_blanks: keyword; when true, ``blank.pdf`` from
        ``settings.MEDIA_ROOT`` is inserted after every input that has an odd
        number of pages.
    :param filename: keyword; path of the combined PDF.  When omitted a
        temporary ``.pdf`` file is created and its name is used.
    :returns: path of the combined PDF.
    :raises Exception: when the PDFtk command exits with a non-zero status.
    """
    add_blanks = kwargs.get('add_blanks', False)
    out_filename = kwargs.get('filename', False)

    tmpdir = tempfile.mkdtemp(prefix="combinepdfs")
    try:
        # Copy all files to the tmpdir, and add blank pages if needed.  The
        # zero-padded names make a plain sort reproduce the argument order,
        # with "NNNNNNa.pdf" sorting right after "NNNNNN.pdf".
        for i, filename in enumerate(args):
            shutil.copy(filename, os.path.join(tmpdir, "%06d.pdf" % i))
            if add_blanks:
                # PDFs are binary; text mode can corrupt the read.
                with open(filename, "rb") as fh:
                    num_pages = PdfFileReader(fh).getNumPages()
                if num_pages % 2 == 1:
                    shutil.copy(os.path.join(settings.MEDIA_ROOT, "blank.pdf"),
                                os.path.join(tmpdir, "%06da.pdf" % i))

        # Combine
        if not out_filename:
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fh:
                out_filename = fh.name

        cmd = [settings.PDFTK_CMD] + [
            os.path.join(tmpdir, f) for f in sorted(os.listdir(tmpdir))
        ] + ["cat", "output", out_filename]
        retcode = subprocess.call(cmd)
        if retcode != 0:
            raise Exception("Combining pdfs failed: %s" % (" ".join(cmd)))
    finally:
        # Remove the working copies on failure as well as on success.
        shutil.rmtree(tmpdir)
    return out_filename
Beispiel #30
0
def getPDFContents(path):
    """Return the lower-cased text of a PDF with whitespace collapsed.

    :param path: path of the PDF file to read.
    :returns: the extracted text of all pages joined by single spaces.  An
        empty string is returned for encrypted files and when the file cannot
        be opened; if extraction fails part-way, the text gathered so far is
        returned as-is.

    Errors are reported on stdout rather than raised.
    """
    content = ""
    try:
        with open(path, "rb") as handle:
            pdf = PdfFileReader(handle)
            if pdf.isEncrypted:
                print("%s is encrypted!" % path)
            else:
                # get all pages and put them in a string
                for i in range(0, pdf.getNumPages()):
                    content += pdf.getPage(i).extractText().lower() + " \n"
                # Turn non-breaking spaces into plain ones, then collapse
                # every run of whitespace to a single space.
                content = u" ".join(
                    content.replace(u"\xa0", u" ").strip().split())
    except Exception as e:
        # Best effort: report the problem and hand back what we have.
        print(e.args)
    return content
    def test_read_pdf(self):
        """Read the sample PDF stored under ``data`` and check its text."""
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")
        here = os.path.split(__file__)[0]
        pdffile = os.path.join(here, "data", "1305.0445.pdf")
        assert os.path.exists(pdffile)

        with open(pdffile, "rb") as handle:
            reader = PdfFileReader(handle)
            title = reader.getDocumentInfo().title
            traw = reader.getDocumentInfo().title_raw
            npage = reader.getNumPages()
            fLOG("title", title, "*", traw)
            fLOG("nb pages", npage)

            # Log everything the first page exposes.
            first_page = reader.getPage(0)
            fLOG("cont", first_page.getContents())
            for obj in first_page:
                fLOG("obj", obj, "*", obj.title())
            for annot in first_page.raw_get("/Annots"):
                fLOG("annot", annot, dir(annot))
            for item in first_page.items():
                fLOG("item", item)

            # The extracted text must keep spaces and line breaks and
            # contain a known phrase.
            text = first_page.extractText()
            fLOG("text---", text)
            assert " " in text
            assert "\n" in text
            if "algorithms: their inability" not in text:
                raise Exception(text)
Beispiel #32
0
def read_neb_enzyme_price_list():
    """Download the NEB price list PDF and parse it into price tuples.

    :returns: list of ``(name, small_cost, large_cost, small_unit,
        large_unit)`` tuples sorted by name.
    """
    # throws URLError, IOError
    response = urllib2.urlopen(NEB_PRICE_LIST_URL)
    reader = PdfFileReader(StringIO(response.read()))

    enzymes = []
    for page_index in range(reader.getNumPages()):
        # fi/fl misread hacks -- the PDF text extraction is poor here
        page_text = reader.getPage(page_index).extractText()
        page_text = page_text.replace(u'\u02dc', 'fi').replace(u'˚', 'fl')

        for match in NEB_PRICE_LINE_RE.finditer(page_text):
            # groups: name prefix, lastletter(+supplement)+small_cost,
            # supplement, large_cost, small_unit, large_unit
            (name_prefix, transition, supplement,
             large_cost, small_unit, large_unit) = match.groups()

            if supplement:
                # The name runs up to the end of the supplement; the small
                # cost is whatever follows it.
                carryover = transition.index(supplement) + len(supplement)
                name_tail = transition[:carryover]
                cost_text = transition[carryover:]
            else:
                name_tail = transition[0]
                cost_text = transition[1:]
            name = "%s%s" % (name_prefix, name_tail)
            small_cost = int_comma(cost_text)

            record = (name,
                      small_cost,
                      int_comma(large_cost),
                      int_comma(small_unit),
                      int_comma(large_unit))
            # Restore the original column order: costs first, then units.
            enzymes.append(
                (record[0], record[1], record[2], record[3], record[4]))

    return sorted(enzymes, key=operator.itemgetter(0))
Beispiel #33
0
def process_file(f):
    """Splits the file into parts if necessary,
    then adds it to the global queue.

    :param f: file name relative to ``path_to_watch``.

    Files whose name does not end in ``.pdf`` and files that cannot be
    opened or parsed are logged and skipped.
    """
    global file_queue
    filename = path_to_watch + "/" + f
    # Non-pdfs are not supported
    if filename[-4:] != ".pdf":
        log("Not a valid PDF file.")
        return

    fp = None
    try:
        try:
            fp = open(filename, 'rb')
            pdf_f = PdfFileReader(fp)
        except IOError as e:
            log("ERROR: Unable to process file " + filename)
            log(str(e))
            return
        except Exception as e:
            # The old ``except e:`` evaluated an undefined name, so any
            # parse error surfaced as a NameError instead of being logged.
            log("ERROR: Unable to read PDF File")
            log(str(e))
            return

        if pdf_f.getNumPages() > (10 + real_leeway):
            split_file(pdf_f, filename)
        else:
            file_queue.append(filename)
    finally:
        # Close the handle on every path, including the error returns.
        if fp is not None:
            fp.close()
Beispiel #34
0
def MergePDF(filepath,outfile):
    """Merge the PDFs listed by ``getFileName(filepath)`` into one file.

    The result is written to ``filepath + outfile``.  Entries whose name
    does not contain ".pdf" are skipped.
    """
    writer = PdfFileWriter()
    total_pages = 0
    for each in getFileName(filepath):
        print("file:" + each)
        if ".pdf" not in each:
            continue
        # Read the source PDF.
        reader = PdfFileReader(file(each, "rb"))

        # An encrypted PDF has to be decrypted before pyPdf can use it.
        if reader.isEncrypted == True:
            reader.decrypt("map")

        # Number of pages in the source PDF.
        page_total = reader.getNumPages()
        total_pages += page_total
        print(page_total)

        # Append every page to the output document.
        for page_index in range(0, page_total):
            writer.addPage(reader.getPage(page_index))

    print("All Pages Number:" + str(total_pages))
    # Finally write the merged PDF.
    target = file(filepath + outfile, "wb")
    writer.write(target)
    target.close()
    print("finished")
Beispiel #35
0
def select(filesandranges, outputfilename, verbose):
    """Write only the listed pages of the given PDFs to ``outputfilename``.

    :param filesandranges: sequence of dicts with a ``'name'`` key (path of
        an input PDF) and a ``'pages'`` key (iterable of 1-based page numbers
        to copy, in output order).
    :param outputfilename: path of the PDF to create; it must not exist yet.
    :param verbose: when true, print the arguments before doing anything.

    Exit statuses: 2 when an input file is missing, the output already
    exists or an input cannot be read as a PDF; 3 when a requested page
    number is outside the document.  ``halp()`` is called before exiting.
    """
    if verbose:
        print(str(filesandranges) + "\noutput: " + str(outputfilename))

    for entry in filesandranges:
        if not os.path.exists(entry['name']):
            halp()
            print("error: " + entry['name'] + " does not exist... exiting nao")
            sys.exit(2)  # input file is missing
    if os.path.exists(outputfilename):
        halp()
        # Report the output file itself; the old message used a stale loop
        # index and named one of the input files instead.
        print("error: " + outputfilename + " does already exist... exiting nao")
        sys.exit(2)  # refuse to overwrite

    output = PdfFileWriter()
    handles = []  # inputs must stay open until the writer has been flushed
    try:
        try:
            for pdf in filesandranges:
                handle = open(pdf["name"], "rb")
                handles.append(handle)
                fiel = PdfFileReader(handle)
                page_total = fiel.getNumPages()
                for pagenr in pdf["pages"]:
                    if 1 <= pagenr <= page_total:
                        output.addPage(fiel.getPage(pagenr - 1))
                    else:
                        print("one or more pages are not in the chosen PDF")
                        halp()
                        sys.exit(3)  # wrong pages or ranges
        except Exception:
            # ``except Exception`` lets the SystemExit(3) above through; a
            # bare ``except`` used to turn it into exit status 2.
            halp()
            sys.exit(2)  # pdf file is no pdf file...
        if not os.path.exists(outputfilename):
            with open(outputfilename, "wb") as outputStream:
                output.write(outputStream)
        else:
            print("file exists, discontinuing operation")
    finally:
        for handle in handles:
            handle.close()
Beispiel #36
0
    def action_view_annotation(self):
        """Write the attached PDF to disk, re-box its pages and open the copy.

        The file is stored in the directory named by the
        ``product.image.directory`` record of type ``reporting``, a copy with
        adjusted trim/crop boxes is written next to it (time-stamped name)
        and that copy is opened through ``webbrowser``.

        :returns: the result of ``webbrowser.open_new``.
        """
        results = self.env["product.image.directory"].search(
            [("type", "=", "reporting")]
        )
        # The last matching record wins.
        # NOTE(review): ``directory`` stays unbound when the search returns
        # nothing -- confirm a reporting directory is always configured.
        for result in results:
            directory = result.name
        # Split on the last dot only, so "a.b.pdf" keeps its "pdf" extension.
        file_name, extension = self.file_id.filename.rsplit(".", 1)
        file_data = self.file_id.file
        path_file = directory + file_name + "." + extension
        self.write_file(path_file, file_data)

        # One reader is enough; the earlier extra reader was never closed
        # and its result was never used.
        with open(path_file, "rb") as in_f:
            input1 = PdfFileReader(in_f)
            output = PdfFileWriter()

            numPages = input1.getNumPages()
            print("document has %s pages." % numPages)

            for i in range(numPages):
                page = input1.getPage(i)
                print(page.mediaBox.getUpperRight_x(), page.mediaBox.getUpperRight_y())
                page.trimBox.lowerLeft = (0, 0)
                page.trimBox.upperRight = (100, 100)
                page.cropBox.lowerLeft = (0, 0)
                page.cropBox.upperRight = (2592, 1686)
                output.addPage(page)

            # Must be written while the input is still open.
            path_out = directory + file_name + time.strftime("%H%M%S") + "." + extension
            with open(path_out, "wb") as out_f:
                output.write(out_f)
        return webbrowser.open_new(r"file://" + path_out)
Beispiel #37
0
 def test_cat(self):
     """Make sure files are properly concatenated."""
     command = [STAPLER, 'cat', ONEPAGE_PDF, FIVEPAGE_PDF, self.outputfile]
     check_call(command)
     self.assert_(os.path.isfile(self.outputfile))
     # One page plus five pages must give six.
     merged = PdfFileReader(file(self.outputfile, 'rb'))
     self.assertEqual(merged.getNumPages(), 6)
    def get_pdf(self, docids, report_name, html=None, data=None):
        """Render the report, adding OMR marks for communication jobs.

        When the report model is ``partner.communication.job`` and at least
        one job has ``omr_enable_marks`` set, the PDF is rebuilt job by job
        so the marks can be added per job.  Otherwise the parent
        implementation is used unchanged.
        """
        report = self._get_report_from_name(report_name)
        communication_job_model = 'partner.communication.job'

        rebuild_with_omr = False
        if report.model == communication_job_model:
            jobs = self.env[communication_job_model].browse(docids)
            rebuild_with_omr = bool(jobs.filtered('omr_enable_marks'))

        if not rebuild_with_omr:
            return super(OmrAwareReport, self).get_pdf(docids,
                                                       report_name,
                                                       html=html,
                                                       data=data)

        # Add OMR marks on pages of the jobs:
        # the PDF has to be reconstructed job by job.
        merged = PdfFileWriter()
        for job in jobs:
            job_data = super(OmrAwareReport, self) \
                .get_pdf(job.ids, report_name, html=html, data=data)
            if job.omr_enable_marks:
                is_latest_document = not job.attachment_ids.filtered(
                    'attachment_id.enable_omr')
                job_data = job.add_omr_marks(job_data, is_latest_document)
            pdf_buffer = StringIO.StringIO()
            pdf_buffer.write(job_data)
            job_pdf = PdfFileReader(pdf_buffer)
            for page_index in range(0, job_pdf.getNumPages()):
                merged.addPage(job_pdf.getPage(page_index))

        out_buffer = StringIO.StringIO()
        merged.write(out_buffer)
        return out_buffer.getvalue()
Beispiel #39
0
    def __call__(self, data, attachments=[], pages=None):
        """Render the form fields and build the output PDF writer.

        :param data: passed to ``self.template_args`` for every field.
        :param attachments: objects whose ``pdf()`` page is merged onto a
            new blank page appended to the output.
        :param pages: page indexes of the filled form to keep; all pages
            when empty or ``None``.
        :returns: a ``PdfFileWriter`` holding the selected pages followed by
            one page per attachment.
        """
        self.rendered = {}
        for field, ctx in self.fields.items():
            if "template" in ctx:
                self.context = ctx
                kwargs = self.template_args(data)
                template = self.context["template"]

                try:
                    rendered_field = template.render(**kwargs)
                except Exception as err:
                    logger.error("%s: %s %s", field, template, err)
                    continue

                # Skip the field if it is already rendered by filter
                if field not in self.rendered:
                    self.rendered[field] = rendered_field

        filled = PdfFileReader(self.exec_pdftk(self.rendered))
        for pagenumber, watermark in self.watermarks:
            filled.getPage(pagenumber).mergePage(watermark)

        output = PdfFileWriter()
        if not pages:
            pages = xrange(filled.getNumPages())
        for page_index in pages:
            output.addPage(filled.getPage(page_index))

        for attachment in attachments:
            blank_page = output.addBlankPage()
            blank_page.mergePage(attachment.pdf())

        return output
Beispiel #40
0
def main():
    """Combine the PDF files named on the command line into one PDF.

    The files are appended in argument order and written to the current
    directory as ``YYYY-MM-DD_HH-MM-SS.pdf`` (local time).  The usage text is
    printed when no file is given; a single file is refused.  Both cases end
    the process through ``sys.exit()``.
    """
    # Parse command line
    pdf_files = sys.argv[1:]
    if len(pdf_files) == 0:
        print(__usage__)
        sys.exit()

    # Make sure there is more than one pdf file
    if len(pdf_files) == 1:
        print("In the spirit of gnu tar, this script cowardly refuses to")
        print("combine one pdf file!")
        sys.exit()

    # Create unique name for output file
    output_file = time.strftime("%Y-%m-%d_%H-%M-%S") + ".pdf"

    # Combine pdf files in order
    output = PdfFileWriter()
    handles = []  # inputs must stay open until the writer has been flushed
    try:
        for pdf in pdf_files:
            handle = open(pdf, "rb")
            handles.append(handle)
            reader = PdfFileReader(handle)
            for i in range(reader.getNumPages()):
                output.addPage(reader.getPage(i))

        # Write final pdf
        with open(output_file, "wb") as stream:
            output.write(stream)
    finally:
        for handle in handles:
            handle.close()
def print_danfe(invoices):
    """Generate the DANFE PDFs of the invoices and return them merged.

    For every invoice a DANFE is generated under ``/tmp/``; when the invoice
    has CC-e events a DAEDE document is generated as well.  All generated
    files are concatenated in that order.

    :param invoices: iterable of invoice records; ``is_danfe_printed`` is set
        to ``True`` on each of them.
    :returns: the merged PDF as a string, or ``""`` when nothing was
        generated.
    """
    str_pdf = ""
    paths = []

    for inv in invoices:
        # Pick the layout class matching the NF-e version.
        # NOTE(review): an unknown ``nfe_version`` leaves ``procnfe`` unbound
        # (or reuses the one from the previous invoice) -- confirm the set of
        # versions is closed.
        if inv.nfe_version == '1.10':
            from pysped.nfe.leiaute import ProcNFe_110
            procnfe = ProcNFe_110()
        elif inv.nfe_version == '2.00':
            from pysped.nfe.leiaute import ProcNFe_200
            procnfe = ProcNFe_200()
        elif inv.nfe_version == '3.10':
            from pysped.nfe.leiaute import ProcNFe_310
            procnfe = ProcNFe_310()
        elif inv.nfe_version == '4.00':
            from pysped.nfe.leiaute import ProcNFe_400
            procnfe = ProcNFe_400()

        file_xml = monta_caminho_nfe(inv.company_id, inv.nfe_access_key)
        if inv.state not in ('open', 'paid', 'sefaz_cancelled'):
            file_xml = os.path.join(file_xml, 'tmp/')
        procnfe.xml = os.path.join(file_xml, inv.nfe_access_key + '-nfe.xml')
        danfe = DANFE()
        danfe.logo = add_backgound_to_logo_image(inv.company_id)
        danfe.NFe = procnfe.NFe
        danfe.leiaute_logo_vertical = inv.company_id.nfe_logo_vertical
        danfe.protNFe = procnfe.protNFe
        danfe.caminho = "/tmp/"
        danfe.gerar_danfe()
        paths.append(danfe.caminho + danfe.NFe.chave + '.pdf')
        inv.is_danfe_printed = True

        if inv.cce_document_event_ids:
            daede = DAEDE()
            daede.logo = add_backgound_to_logo_image(inv.company_id)
            daede.NFe = procnfe.NFe
            daede.protNFe = procnfe.protNFe
            for item, event in enumerate(inv.cce_document_event_ids):
                proc_evento = ProcEventoCCe_100()
                doc_item = str(item + 1).zfill(2)
                proc_evento.xml = os.path.join(
                    file_xml, inv.nfe_access_key + '-' + doc_item + '-cce.xml')
                daede.procEventos.append(proc_evento)

            daede.caminho = "/tmp/"
            daede.gerar_daede()
            paths.append(daede.caminho + 'eventos-' + daede.NFe.chave + '.pdf')

    output = PdfFileWriter()
    s = StringIO()
    handles = []  # inputs must stay open until the writer has been flushed
    try:
        for path in paths:
            handle = open(path, "rb")
            handles.append(handle)
            pdf = PdfFileReader(handle)
            for i in range(pdf.getNumPages()):
                output.addPage(pdf.getPage(i))

        # Write the document once, after all pages are collected.  Writing
        # inside the loop appended a growing copy of it for every path.
        if paths:
            output.write(s)
        str_pdf = s.getvalue()
    finally:
        for handle in handles:
            handle.close()
        s.close()
    return str_pdf
Beispiel #42
0
def pageattach(document, document2, pagenum, outfile):
    """Combine the first page of ``document2`` with ``document``.

    :param document: path of the main PDF.
    :param document2: path of the PDF whose first page is used.
    :param pagenum: 0-based page index in ``document``.
    :param outfile: path of the PDF to write.

    When ``document`` has several pages, the page at index ``pagenum`` is
    replaced by the first page of ``document2``.  When it has exactly one
    page, the new page is placed before it if ``pagenum`` is below 1 and
    after it otherwise.  ``outfile`` is created in every case; it is left
    empty when ``document`` has no pages.
    """
    with open(document, "rb") as stream1, open(document2, "rb") as stream2:
        infile1 = PdfFileReader(stream1)
        infile2 = PdfFileReader(stream2)
        PagesDoc1 = infile1.getNumPages()
        output = PdfFileWriter()

        if PagesDoc1 > 1:  # multi-page input: swap in the new page
            for x in range(PagesDoc1):
                if x == pagenum:
                    output.addPage(infile2.getPage(0))
                else:
                    output.addPage(infile1.getPage(x))
        elif PagesDoc1 == 1:  # single-page input: put the new page next to it
            if pagenum < PagesDoc1:
                output.addPage(infile2.getPage(0))
                output.addPage(infile1.getPage(0))
            else:
                output.addPage(infile1.getPage(0))
                output.addPage(infile2.getPage(0))

        with open(outfile, 'wb') as outputStream:
            # Write the document once; writing after every added page
            # appended a growing copy of it to the file each time.
            if PagesDoc1 >= 1:
                output.write(outputStream)
Beispiel #43
0
def parse_file(pdfFile,nameFile):
    """Split a PDF by recipient name and e-mail each part.

    Pages are scanned in order.  A page whose text contains one of the names
    from ``nameFile`` starts a new document; following pages are appended to
    it until the next name is found.  Every finished document is passed to
    ``send_email``.  Pages before the first name are dropped.
    """
    reader = PdfFileReader(file(pdfFile,"rb"))

    # read the names and emails from csv file
    names = get_names(nameFile)

    # create an instance in SMTP server
    smtp = smtplib.SMTP('localhost')

    current_writer = None   # document being assembled, if any
    current_name = ""       # name that started the current document
    page_index = 0
    while page_index < reader.getNumPages():
        page = reader.getPage(page_index)
        page_text = page.extractText()      # extract the pdf text
        for name in names.keys():
            if name.lower() in page_text.lower():
                if current_writer is not None:   # send the current pdf
                    send_email(smtp, current_writer, current_name, names)

                # start a new document for this name
                current_writer = PdfFileWriter()
                current_name = name
                break
        if current_writer is not None:
            current_writer.addPage(page)
        page_index += 1

    # send the last file
    if current_writer is not None:
        send_email(smtp, current_writer, current_name, names)

    # quit the smtp server
    smtp.quit()
def getNPersonal(paper):
    """Count the occurrences of " we " and " i " in a paper's PDF.

    :param paper: object with a ``links`` iterable; the first link whose
        ``title`` is ``'pdf'`` supplies the download URL through its
        ``'href'`` item.
    :returns: the number of occurrences over all pages (a count of exactly 1
        is reported as 0), or -1 when there is no PDF link or the PDF cannot
        be downloaded or read.
    """
    pdfURL = None
    for link in paper.links:
        try:
            if link.title == 'pdf':
                pdfURL = link['href']
                break
        except AttributeError:
            # Links without a title cannot be the PDF link.
            continue

    # Handle the missing link explicitly instead of relying on a NameError
    # being swallowed further down.
    if pdfURL is None:
        print("Error reading file")
        return -1

    try:
        rFile = urlopen(Request(pdfURL)).read()
        pdfFile = PdfFileReader(StringIO(rFile))

        thisNPersonal = 0
        for page in range(0, pdfFile.getNumPages()):
            pageStr = pdfFile.getPage(page).extractText().lower()
            thisNPersonal += pageStr.count(' we ')
            thisNPersonal += pageStr.count(' i ')
    except Exception:
        # ``except Exception`` keeps Ctrl-C and sys.exit() working.
        print("Error reading file")
        return -1

    # A single hit is treated as noise.
    thisNPersonal = 0 if thisNPersonal == 1 else thisNPersonal
    print(thisNPersonal)
    return thisNPersonal
    def add_omr_marks(self, pdf_data, is_latest_document):
        """Return ``pdf_data`` with an OMR layer merged on even-indexed pages.

        :param pdf_data: content of the PDF to mark.
        :param is_latest_document: whether this is the last document; used
            when computing the marks of the page at ``latest_omr_page``.
        :returns: content of the marked PDF.
        """
        # Documentation
        # http://meteorite.unm.edu/site_media/pdf/reportlab-userguide.pdf
        # https://pythonhosted.org/PyPDF2/PdfFileReader.html
        # https://stackoverflow.com/a/17538003
        # https://gist.github.com/kzim44/5023021
        # https://www.blog.pythonlibrary.org/2013/07/16/
        #   pypdf-how-to-write-a-pdf-to-memory/
        self.ensure_one()

        pdf_buffer = StringIO.StringIO()
        pdf_buffer.write(pdf_data)

        existing_pdf = PdfFileReader(pdf_buffer)
        output = PdfFileWriter()
        total_pages = existing_pdf.getNumPages()

        # Page index that receives the "latest page" marks.
        # NOTE(review): ``total_pages // 2`` only equals the last
        # even-indexed page for 1 or 4 pages -- confirm this is intended.
        latest_omr_page = total_pages // 2

        for page_number in range(total_pages):
            page = existing_pdf.getPage(page_number)
            # Only even-indexed pages (recto) get OMR marks.  Compare by
            # value: ``is 0`` depends on interpreter int caching.
            if page_number % 2 == 0:
                is_latest_page = is_latest_document and \
                    page_number == latest_omr_page
                marks = self._compute_marks(is_latest_page)
                omr_layer = self._build_omr_layer(marks)
                page.mergePage(omr_layer)
            output.addPage(page)

        out_buffer = StringIO.StringIO()
        output.write(out_buffer)

        return out_buffer.getvalue()
Beispiel #46
0
def pdfReader(name, q):
    """Worker loop: take PDF paths from ``q`` and queue their text.

    Runs until the global ``exitFlag`` becomes true.  While holding
    ``queueLock`` it takes one file name from ``q``, extracts the lower-cased,
    whitespace-collapsed text of all pages and puts it on ``out_queue``.
    Encrypted files are reported and skipped; errors are printed.

    :param name: label used in the progress messages.
    :param q: queue of PDF file names.
    """
    while not exitFlag:
        queueLock.acquire()
        try:
            # NOTE(review): emptiness is checked on the global ``queue`` but
            # the item is taken from ``q`` -- confirm they are the same queue.
            if not queue.empty():
                # get the filename to open
                pdfFile = q.get()
                print("%s processing %s" % (name, pdfFile))
                try:
                    content = ""
                    with open(pdfFile, "rb") as handle:
                        pdf = PdfFileReader(handle)
                        if pdf.isEncrypted:
                            print("%s is encrypted!" % pdfFile)
                        else:
                            # get all pages and put them in a string
                            for i in range(0, pdf.getNumPages()):
                                content += \
                                    pdf.getPage(i).extractText().lower() + " \n"
                            content = u" ".join(
                                content.replace(u"\xa0", u" ").strip().split())
                            print(repr(content))
                            out_queue.put(content)
                except Exception as e:
                    print(e.args)
        finally:
            # Always release: the old ``continue`` for encrypted files
            # skipped the release and blocked every worker forever.
            queueLock.release()
        time.sleep(1)
Beispiel #47
0
def text_analytics(dataset_id, url, notes):
    """Return the language analysis of a remote PDF, using a cache.

    A non-empty cached response for ``dataset_id`` is returned directly.
    Otherwise the PDF at ``url`` is downloaded, its page text is appended to
    ``notes`` and the result is analysed with AlchemyLanguage; the distinct
    relation sentences are stored under ``"summary"`` and the analysis is
    cached.

    :returns: the cached or fresh analysis.  Any failure is swallowed and
        whatever was obtained up to that point (possibly ``None``) is
        returned.
    """
    cache_response = cache_db(dataset_id)
    if len(cache_response) > 0:
        return cache_response

    data = None
    try:
        memory_file = StringIO(urlopen(url).read())
        pdf_to_read = PdfFileReader(memory_file)

        raw_text = notes
        for page_index in xrange(pdf_to_read.getNumPages()):
            try:
                raw_text += pdf_to_read.getPage(page_index).extractText()
            except Exception:
                # Pages that cannot be extracted are skipped.
                continue

        alchemy_language = AlchemyLanguageV1(api_key=settings.API_KEY)
        data = alchemy_language.combined(
            raw_text,
            extract="concepts dates title keywords relations entities",
            max_items=10)
        data["summary"] = set(
            relation["sentence"] for relation in data["relations"])
        cache_db(dataset_id, data)
    except Exception:
        pass
    return data
Beispiel #48
0
    def add_guides(self):
        """Merge guide marks onto the first page of ``sig.pdf``.

        The guides are generated according to ``self.args`` (``longarm``,
        ``a5``, ``signature``) and the result is written to ``sigs.pdf``.
        """
        with open('sig.pdf', 'rb') as source:
            pdf_in = PdfFileReader(source)
            pdf_out = PdfFileWriter()

            for i in xrange(pdf_in.getNumPages()):
                page = pdf_in.getPage(i)
                if i == 0:
                    # Only the first page carries the guides.
                    guides = StringIO()

                    if self.args.longarm:
                        create_pdf(
                            guides, a4lwidth_pt, a4lheight_pt,
                            generate_longarm())
                    else:
                        if self.args.a5:
                            w, h = a5width_pt, a5height_pt
                        else:
                            w, h = a4lwidth_pt, a4lheight_pt
                        create_pdf(guides, w, h, generate_shortarm(
                            self.args.a5, bool(self.args.signature)))

                    pdf_guides = PdfFileReader(guides)
                    page.mergePage(pdf_guides.getPage(0))
                pdf_out.addPage(page)

            # Write while the source is still open, and close the target
            # explicitly so the data is flushed to disk.
            with open('sigs.pdf', 'wb') as target:
                pdf_out.write(target)
    def MergePDF(self, params):
        """Merge the numbered PDFs ``<filepath>0.pdf`` ... into one file.

        :param params: dict with ``'filepath'`` (path prefix of the inputs
            and the output), ``'fileCount'`` (number of inputs) and
            ``'outfile'`` (name appended to ``'filepath'`` for the result).

        Returns early when ``self.getFileName`` finds no files.
        """
        writer = PdfFileWriter()
        total_pages = 0
        found_files = self.getFileName(params['filepath'])
        if len(found_files) < 1:
            print('there is not any files')
            return
        for i in range(0, params['fileCount']):
            filename = params['filepath'] + str(i) + '.pdf'
            print('*********************%s************************' % i)
            # Read the source PDF.
            reader = PdfFileReader(file(filename, "rb"))

            # An encrypted PDF has to be decrypted before pyPdf can use it.
            if reader.isEncrypted == True:
                reader.decrypt("map")

            # Number of pages in the source PDF.
            page_total = reader.getNumPages()
            total_pages += page_total
            print(page_total)

            # Append every page to the output document.
            for page_index in range(0, page_total):
                writer.addPage(reader.getPage(page_index))

        print("All Pages Number:" + str(total_pages))
        # Finally write the merged PDF.
        target = file(params['filepath'] + params['outfile'], "wb")
        writer.write(target)
        target.close()
        print("finished")
Beispiel #50
0
 def output(self):
     """Export the pages listed in the line edit to a PDF chosen by the user.

     Asks for the target file with a save dialog, parses the page
     selection typed into ``self.pages_line_edit`` and writes those pages
     of ``self.in_filename`` to the chosen file.  Nothing is written when
     the dialog is cancelled.
     """
     # get the output filename using the file dialog
     (out_filename, selected_filter) = \
         QFileDialog.getSaveFileName(parent = self,
                                     caption = self.tr(u'Export'),
                                     dir = '',
                                     filter = self.tr('pdf (*.pdf)'))
     # A cancelled dialog yields an empty name; there is nothing to do.
     if not out_filename:
         return

     with open(self.in_filename, 'rb') as in_file:
         in_reader = PdfFileReader(in_file)
         out_writer = PdfFileWriter()

         # extract input
         pages_string = self.pages_line_edit.text()

         # Get the indices of pages to extract
         pages = pages_parser(in_reader.getNumPages()).parse(pages_string)

         # append pages to output writer
         for page_index in pages:
             out_writer.addPage(in_reader.getPage(page_index))

         # Open the target only now, so a bad selection or unreadable
         # input does not leave a truncated file behind.
         with open(out_filename, 'wb') as out_file:
             out_writer.write(out_file)
Beispiel #51
0
    def test_read_pdf(self):
        """Read the sample PDF stored under ``data`` and check its text."""
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        pdffile = os.path.join(
            os.path.dirname(__file__), "data", "1305.0445.pdf")
        assert os.path.exists(pdffile)

        with open(pdffile, "rb") as stream:
            document = PdfFileReader(stream)
            title = document.getDocumentInfo().title
            traw = document.getDocumentInfo().title_raw
            npage = document.getNumPages()
            fLOG("title", title, "*", traw)
            fLOG("nb pages", npage)

            # Log everything the first page exposes.
            page = document.getPage(0)
            cont = page.getContents()
            fLOG("cont", cont)
            for entry in page:
                fLOG("obj", entry, "*", entry.title())
            annotations = page.raw_get("/Annots")
            for annotation in annotations:
                fLOG("annot", annotation, dir(annotation))
            for pair in page.items():
                fLOG("item", pair)

            # The extracted text must keep spaces and line breaks and
            # contain a known phrase.
            text = page.extractText()
            fLOG("text---", text)
            assert " " in text
            assert "\n" in text
            if "algorithms: their inability" not in text:
                raise Exception(text)
Beispiel #52
0
	def choose_file(self,widget,data=None):
		"""Let the user pick a PDF and show its text in ``textbuffer``.

		Opens a file chooser restricted to ``*.pdf``.  On OK the text of
		all pages of the selected file is extracted, whitespace is
		collapsed and the result is put into the global ``textbuffer``.
		"""
		global textbuffer
		dialog = gtk.FileChooserDialog("Open..",
			None,
			gtk.FILE_CHOOSER_ACTION_OPEN,
			(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
			 gtk.STOCK_OPEN, gtk.RESPONSE_OK))
		try:
			dialog.set_default_response(gtk.RESPONSE_OK)
			pdf_filter = gtk.FileFilter()
			pdf_filter.set_name("PDF files")
			pdf_filter.add_pattern("*.pdf")
			dialog.add_filter(pdf_filter)
			response = dialog.run()

			if response == gtk.RESPONSE_OK:
				filename = dialog.get_filename()
				print("%s selected" % filename)
				from pyPdf import PdfFileReader
				# Read the file the user picked, not a fixed
				# "kpeng.pdf".
				with open(filename, "rb") as pdf_file:
					pdf = PdfFileReader(pdf_file)
					pages = [pdf.getPage(i).extractText()
						for i in range(0, pdf.getNumPages())]
				# Join the pages, turn non-breaking spaces into plain
				# ones and collapse whitespace.  The old code used the
				# literal text "/n" and u"/xa0" instead of escapes.
				content = u"\n".join(pages)
				content = u" ".join(content.replace(u"\xa0", u" ").split())
				textbuffer.set_text(content)

			elif response == gtk.RESPONSE_CANCEL:
				print('Closed, no files selected')
		finally:
			# Close the dialog even when reading the PDF fails.
			dialog.destroy()
Beispiel #53
0
    def _merge_pdf(self, documents):
        """Merge PDF files into one.

        :param documents: list of path of pdf files
        :returns: path of the merged pdf
        """
        writer = PdfFileWriter()
        # The streams have to be closed *after* PdfFileWriter's call to
        # write(), and are closed even when reading or writing fails.
        streams = []
        try:
            for document in documents:
                pdfreport = open(document, 'rb')
                streams.append(pdfreport)
                reader = PdfFileReader(pdfreport)
                for page in range(0, reader.getNumPages()):
                    writer.addPage(reader.getPage(page))

            # NOTE(review): the '.html' suffix is kept so the returned path
            # pattern does not change, although the content is a PDF.
            merged_file_fd, merged_file_path = tempfile.mkstemp(
                suffix='.html', prefix='report.merged.tmp.')
            # PDF data is binary: text mode can corrupt it on platforms that
            # translate line endings.
            with closing(os.fdopen(merged_file_fd, 'wb')) as merged_file:
                writer.write(merged_file)
        finally:
            for stream in streams:
                stream.close()

        return merged_file_path
Beispiel #54
0
def split_pset():
    """Split a problem set's answer PDF into one PDF per question.

    Reads ``pset<N>/latex/pset<N>_answers.pdf`` (``N`` being
    ``options.pset``) and, for every comma-separated entry of
    ``options.probs``, writes the pages returned by ``get_pages`` to
    ``pset<N>/latex/pset<N>-<question>_answer.pdf``.
    """
    if not (options.pset and options.probs):
        print_err_and_die(
            "You must enter both arguements! run with -h for help")

    path = "pset%s/latex/" % options.pset
    try:
        filename = "%spset%s_answers.pdf" % (path, options.pset)
        inp = PdfFileReader(file(filename, "rb"))
    except IOError:
        print_err_and_die("Error! File, %s was not found." % filename)

    # loop over user input and break up pdf
    for questionNum, prob in enumerate(options.probs.split(","), 1):
        print("Processing question %s" % questionNum)

        prob = prob.strip()  # kill whitespace

        out = PdfFileWriter()
        for page in get_pages(prob, inp.getNumPages()):
            print("page num %s" % str(page))
            # page numbers are 1-based, pyPdf indexes from 0
            out.addPage(inp.getPage(int(page) - 1))

        target_name = "%spset%s-%s_answer.pdf" % (
            path, options.pset, questionNum)
        outStream = file(target_name, "wb")
        out.write(outStream)
        outStream.close()

    print("Done!")