Example #1
0
 def __init__(self, id: int):
     """Fetch the Report with the given id and create an empty PDF writer.

     :param id: value matched against ``report_id`` in the Report lookup.
     """
     report = Report.objects.get(report_id=id)
     self.report_model = report
     self.pdf_writer = PyPDF2.PdfFileWriter()
import PyPDF2, os
#it can only extract text from pdf and not other data
#and have limited set of actions

# Merge two PDFs: every page of the first file followed by every page of the
# second one. The inputs are opened with ``with`` so they are closed even when
# reading or writing fails; they must stay open until the writer has produced
# the output because PyPDF2 reads page content from the source stream lazily.
with open('meetingminutes.pdf', 'rb') as pdf1File, \
        open('meetingminutes2.pdf', 'rb') as pdf2File:
    reader1 = PyPDF2.PdfFileReader(pdf1File)  # first source pdf
    reader2 = PyPDF2.PdfFileReader(pdf2File)  # second source pdf

    # writer for the new pdf that is built in memory
    writer = PyPDF2.PdfFileWriter()

    # append the pages of each source, in order
    for reader in (reader1, reader2):
        for page_num in range(reader.numPages):
            page = reader.getPage(page_num)
            writer.addPage(page)

    with open('CombinedPDFs.pdf', 'wb') as outputfile:
        writer.write(outputfile)
Example #3
0
 def __init__(self, in_file: str, out_file: str):
     """Open ``in_file`` for reading and prepare a writer cloned from it.

     :param in_file: PDF passed to ``PyPDF2.PdfFileReader`` (non-strict mode).
     :param out_file: stored unchanged on ``self.out_file``.
     """
     reader = PyPDF2.PdfFileReader(in_file, strict=False)
     writer = PyPDF2.PdfFileWriter()
     self.__reader = reader
     self.__writer = writer
     # Copy the reader's document root into the writer.
     writer.cloneReaderDocumentRoot(reader)
     self.out_file = out_file
Example #4
0
    def add_files_to_pdf(
        self,
        files: list = None,
        target_document: str = None,
    ) -> None:
        """Add images and/or pdfs to new PDF document

        Image formats supported are JPEG, PNG and GIF.

        The file can be added with extra properties by
        denoting `:` at the end of the filename.

        Supported extra properties are:

        - for PDF, the page numbers to be included in the new PDF document
        - for images, the position of the image in the new PDF document

        **Examples**

        **Robot Framework**

        .. code-block:: robotframework

            ***Settings***
            Library    RPA.PDF

            ***Tasks***
            Add files to pdf
                ${files}=    Create List
                ...    ${TESTDATA_DIR}${/}invoice.pdf
                ...    ${TESTDATA_DIR}${/}approved.png:center
                ...    ${TESTDATA_DIR}${/}robot.pdf:1
                ...    ${TESTDATA_DIR}${/}approved.png:x=0,y=0
                ...    ${TESTDATA_DIR}${/}robot.pdf:2-10,15
                ...    ${TESTDATA_DIR}${/}approved.png
                Add Files To PDF    ${files}    newdoc.pdf

        **Python**

        .. code-block:: python

            from RPA.PDF import PDF

            pdf = PDF()

            list_of_files = [
                'invoice.pdf',
                'approved.png:center',
                'robot.pdf:1',
                'approved.png:x=0,y=0',
            ]
            def example_keyword():
                pdf.add_files_to_pdf(
                    files=list_of_files,
                    target_document="output/output.pdf"
                )

        :param files: list of filepaths to add into PDF (can be either images or PDFs);
            ``None`` is treated as an empty list
        :param target_document: filepath of target PDF
        """
        writer = PyPDF2.PdfFileWriter()

        # ``files`` defaults to None; iterate an empty list in that case
        # instead of failing with a TypeError.
        for entry in files or []:
            file_to_add = Path(entry)
            # Optional properties follow the last ":" of the file name.
            namesplit = file_to_add.name.rsplit(":", 1)
            basename = namesplit[0]
            parameters = namesplit[1] if len(namesplit) == 2 else None
            file_to_add = file_to_add.parent / basename
            image_filetype = imghdr.what(str(file_to_add))
            self.logger.info("File %s type: %s", str(file_to_add),
                             image_filetype)
            if basename.endswith(".pdf"):
                reader = PyPDF2.PdfFileReader(str(file_to_add), strict=False)
                pagecount = reader.getNumPages()
                pages = self._get_pages(pagecount, parameters)
                for n in pages:
                    try:
                        # requested page numbers are 1-based, PyPDF2 is 0-based
                        page = reader.getPage(n - 1)
                        writer.addPage(page)
                    except IndexError:
                        self.logger.warning("File %s does not have page %d",
                                            file_to_add, n)
            elif image_filetype in ["png", "jpg", "jpeg", "gif"]:
                # Render the image onto a one-page PDF, then append that page.
                temp_pdf = os.path.join(tempfile.gettempdir(), "temp.pdf")
                pdf = FPDF()
                pdf.set_margin(0)
                pdf.add_page()
                x, y, width, height = self._get_image_coordinates(
                    str(file_to_add), parameters)
                pdf.image(name=file_to_add, x=x, y=y, w=width, h=height)
                pdf.output(name=temp_pdf)

                reader = PyPDF2.PdfFileReader(temp_pdf)
                writer.addPage(reader.getPage(0))

        with open(target_document, "wb") as target:
            writer.write(target)
import PyPDF2
import os
import sys

# Command line tool: stamp the first page of a watermark PDF onto every page
# of a source PDF and write the result to the output path.
if len(sys.argv) != 4:
    raise RuntimeError('Usage {} watermark.pdf source.pdf output.pdf'.format(
        sys.argv[0]))

watermark_path = sys.argv[1]
if not os.path.isfile(watermark_path):
    raise RuntimeError('Watermark file {} not found'.format(watermark_path))

source_path = sys.argv[2]
if not os.path.isfile(source_path):
    raise RuntimeError('Source file {} not found'.format(source_path))

# Both inputs have to remain open until the output is written (PyPDF2 reads
# page data lazily); ``with`` closes them afterwards, also on failure.
with open(watermark_path, 'rb') as watermark_file, \
        open(source_path, 'rb') as source_file:
    watermark = PyPDF2.PdfFileReader(watermark_file)
    source = PyPDF2.PdfFileReader(source_file)

    watermark_page = watermark.getPage(0)

    pdf_output = PyPDF2.PdfFileWriter()

    for pageNum in range(source.numPages):
        pageObj = source.getPage(pageNum)
        pageObj.mergePage(watermark_page)
        pdf_output.addPage(pageObj)

    with open(sys.argv[3], 'wb') as output_file:
        pdf_output.write(output_file)
Example #6
0
def exportPdf(fin, abpath_out, annotations):
    '''Export PDF with annotations.

    Args:
        fin (str): abspath to input PDF file.
        abpath_out (str): abspath to output PDF file.
        annotations (dict): annotation info. See import_mendeley.py
                            getHighlights() for more info. The optional
                            'highlights' and 'notes' entries are looked up
                            by 1-based page number.

    Returns:
        None. Nothing is written when the input file cannot be opened or
        when PyPDF2 reports it as encrypted.
    '''

    try:
        fin_obj = open(fin, 'rb')
    except IOError:
        LOGGER.warning('Could not open pdf file %s', fin)
        # Without an input document there is nothing to export.
        return

    # PyPDF2 resolves page objects lazily from the input stream, so the
    # handle stays open until the output has been written and is closed
    # on every exit path by the ``with`` block.
    with fin_obj:
        try:
            inpdf = PyPDF2.PdfFileReader(fin_obj)
        except IOError:
            LOGGER.warning('Could not open pdf file %s', fin)
            return

        if inpdf.isEncrypted:
            # PyPDF2 seems to think some files are encrypted even
            # if they are not. Trying to decrypt takes a lot of time,
            # and as this rarely happens to academic docs it is skipped
            # and simply treated as a failure.
            return

        # retain meta data
        meta = inpdf.getDocumentInfo()
        outpdf = PyPDF2.PdfFileWriter()
        outpdf.addMetadata(meta)

        # A missing (or None) entry means "no annotations of this kind".
        highlights = annotations.get('highlights') or {}
        notes = annotations.get('notes') or {}

        #----------------Loop through pages----------------
        for pii in range(1, inpdf.getNumPages()+1):
            inpg = inpdf.getPage(pii-1)

            #----------------Process highlights----------------
            if pii in highlights:
                for hjj in highlights[pii]:
                    # Changes suggested by matteosecli: add author of highlight:
                    anno = pdfannotation.createHighlight(
                        hjj["rect"], author=hjj['author'],
                        cdate=hjj["cdate"], color=hjj['color'])
                    inpg = pdfannotation.addAnnotation(inpg, outpdf, anno)

            #------------------Process notes------------------
            if pii in notes:
                for njj in notes[pii]:
                    note = pdfannotation.createNote(
                        njj["rect"], contents=njj["content"],
                        author=njj["author"], cdate=njj["cdate"])
                    inpg = pdfannotation.addAnnotation(inpg, outpdf, note)

            outpdf.addPage(inpg)

        #-----------------------Save-----------------------
        if os.path.isfile(abpath_out):
            os.remove(abpath_out)

        with open(abpath_out, mode='wb') as fout:
            outpdf.write(fout)

    LOGGER.debug('Exported annotated pdf.')

    return
Example #7
0
    def rotate_page(
        self,
        pages: ListOrString,
        source_path: str = None,
        output_path: str = None,
        clockwise: bool = True,
        angle: int = 90,
    ) -> None:
        """Rotate pages in source PDF document and save to target PDF document.

        If no source path given, assumes a PDF is already opened.

        **Examples**

        **Robot Framework**

        .. code-block:: robotframework

            ***Settings***
            Library    RPA.PDF

            ***Tasks***
            Example Keyword
                Rotate Page
                ...          source_path=/tmp/sample.pdf
                ...          output_path=/tmp/output.pdf
                ...          pages=5

        **Python**

        .. code-block:: python

            from RPA.PDF import PDF

            pdf = PDF()

            def rotate_page():
                pdf.rotate_page(
                    source_path="/tmp/sample.pdf",
                    output_path="/tmp/output.pdf",
                    pages=5
                )

        :param pages: page numbers to rotate in the PDF (numbers start from 0).
        :param source_path: filepath to the source pdf.
        :param output_path: filepath to the target pdf, stored by default
            to `output_directory`.
        :param clockwise: direction that page will be rotated to, default True.
        :param angle: number of degrees to rotate, default 90.
        """
        # TODO: don't save to a new file every time
        self.switch_to_pdf(source_path)
        reader = self.ctx.active_pdf_document.reader
        writer = PyPDF2.PdfFileWriter()

        output_filepath = Path(
            output_path) if output_path else self.default_output

        pages = self._get_page_numbers(pages, reader)
        # Every page is copied to the output; only the selected ones are
        # rotated. The loop index is compared directly against ``pages``.
        # NOTE(review): this assumes _get_page_numbers yields 0-based ints -
        # confirm against its implementation.
        for page in range(reader.getNumPages()):
            source_page = reader.getPage(page)
            if page in pages:
                if clockwise:
                    source_page.rotateClockwise(int(angle))
                else:
                    source_page.rotateCounterClockwise(int(angle))
            writer.addPage(source_page)
        with open(str(output_filepath), "wb") as f:
            writer.write(f)
Example #8
0
pdf_reader.getIsEncrypted()  # False

pdf_reader.getNumPages()  # returns the number of pages in the pdf, e.g. 3


page1 = pdf_reader.getPage(0)
page1.extractText()


page2 = pdf_reader.getPage(1)
page2.extractText()


# Append, write or merge PDF

pdf_writer = pdf.PdfFileWriter()


pdf_writer.addPage(page1)
pdf_writer.addPage(page2)


# Raw string: the backslashes of the Windows path are kept literally without
# relying on invalid escape sequences such as "\P" (a warning in recent
# Python versions). ``with`` closes the output even if writing fails.
with open(r'E:\Projects\Extract Text From PDF File\Pages.pdf', 'wb') as output:
    pdf_writer.write(output)



Example #9
0
def _write_page_range(reader, first_page, last_page, path):
    """Write pages first_page..last_page (1-based, inclusive) of reader to path."""
    writer = PyPDF2.PdfFileWriter()
    for page_num in range(first_page, last_page + 1):
        # PyPDF2 page indexes start at 0, human-readable numbers at 1.
        writer.addPage(reader.getPage(int(page_num) - 1))
    with open(path, 'wb+') as out_file:
        writer.write(out_file)


def pdf_replace(request):
    """Replace one page of an uploaded PDF with another uploaded PDF.

    On a valid POST, ``file2`` is split around the 1-based ``page`` number,
    ``file1`` is inserted in place of that page and the merged document is
    returned as an attachment named ``replaced_file.pdf``. For any other
    request (GET, or a POST that fails validation) the form page is rendered
    with a fresh, unbound form.

    NOTE(review): the intermediate and result files use fixed names under
    ``media``, so concurrent requests would overwrite each other's files.
    """
    if request.method == 'POST':
        # The user submitted the form via POST.
        form = PdfReplaceForm(request.POST, request.FILES)
        if form.is_valid():
            # file1: the PDF to insert; file2: the PDF to be modified;
            # page: the page number of file2 that gets replaced.
            f1 = form.cleaned_data['file1']
            f2 = form.cleaned_data['file2']
            page = form.cleaned_data['page']

            pdfFileObj = PyPDF2.PdfFileReader(f2)
            total_page = pdfFileObj.getNumPages()

            part_1_path = os.path.join('media', 'part_1.pdf')
            part_2_path = os.path.join('media', 'part_2.pdf')
            replaced_path = os.path.join('media', 'replaced_file.pdf')

            # Part 1: everything before the replaced page.
            _write_page_range(pdfFileObj, 1, page - 1, part_1_path)
            # Part 2: everything after the replaced page.
            _write_page_range(pdfFileObj, page + 1, total_page, part_2_path)

            # Merge part 1 + inserted file + part 2. The part files must stay
            # open until the merger has written its output; ``with`` closes
            # them afterwards.
            pdfMerger = PyPDF2.PdfFileMerger()
            with open(part_1_path, 'rb') as f2_part_1, \
                    open(part_2_path, 'rb') as f2_part_2:
                pdfMerger.append(PyPDF2.PdfFileReader(f2_part_1))
                pdfMerger.append(PyPDF2.PdfFileReader(f1))
                pdfMerger.append(PyPDF2.PdfFileReader(f2_part_2))

                with open(replaced_path, 'wb') as pdfOutputFile:
                    pdfMerger.write(pdfOutputFile)

            # Stream the merged file back; FileResponse closes the handle
            # when the response is closed.
            response = FileResponse(open(replaced_path, 'rb'))
            # The HTTP header is "Content-Type"; the previous key
            # "content_type" only added a meaningless extra header.
            response['Content-Type'] = "application/octet-stream"
            response[
                'Content-Disposition'] = 'attachment; filename="replaced_file.pdf"'

            return response

    # GET request, or POST that did not validate: show an empty form.
    # NOTE(review): validation errors of a rejected POST are discarded here,
    # as in the original implementation.
    form = PdfReplaceForm()

    return render(request, 'pdf/pdf_replace.html', {'form': form})
Example #10
0
                                  ):  # Loop, for each page of the input pdf
                page = input_reader.getPage(
                    page_num
                )  # Creating the Page Object for each page of the input pdf
                page.mergePage(
                    watermark_page
                )  # Merging pages using the method of the Page Object
                writer_object.addPage(
                    page
                )  # Passing the merged page to the writer object with addPage method
                with open(
                        'super_watermarked.pdf', 'wb'
                ) as result_file:  # Creating the new archive, with merged page
                    writer_object.write(
                        result_file
                    )  # Writing each merged page into new archive


# watermark('super.pdf', 'wtr.pdf.pdf')

# Stamp the first page of the watermark PDF onto every page of the template.
# The inputs stay open until the result is written and are closed by ``with``.
with open('super.pdf', 'rb') as template_file, \
        open('wtr.pdf.pdf', 'rb') as watermark_file:
    template_pdf = PyPDF2.PdfFileReader(template_file)
    watermark_pdf = PyPDF2.PdfFileReader(watermark_file)
    writer_object = PyPDF2.PdfFileWriter()

    for i in range(template_pdf.numPages):
        page = template_pdf.getPage(i)
        page.mergePage(watermark_pdf.getPage(0))
        writer_object.addPage(page)

    # Write once after all pages are merged instead of rewriting the whole
    # output file on every loop iteration.
    with open('watermarked_output.pdf', 'wb') as file:
        writer_object.write(file)
Example #11
0
import PyPDF2 as ppdf

# Combine two PDFs into 'combined.pdf'. All files are managed by ``with`` so
# they are closed (and the output flushed) even when an error occurs.
with open('meetingminutes1.pdf', 'rb') as pdffile1, \
        open('meetingminutes2.pdf', 'rb') as pdffile2:
    reader1 = ppdf.PdfFileReader(pdffile1)
    reader2 = ppdf.PdfFileReader(pdffile2)

    writer = ppdf.PdfFileWriter()

    for p1 in range(reader1.numPages):
        # NOTE(review): for odd indexes the preceding page is added again, so
        # odd-indexed pages of the first file are never copied - confirm
        # that this is intended.
        if p1 % 2:
            page = reader1.getPage(p1 - 1)
        else:
            page = reader1.getPage(p1)
        writer.addPage(page)

    for p2 in range(reader2.numPages):
        page = reader2.getPage(p2)
        writer.addPage(page)

    with open('combined.pdf', 'wb') as outputfile:
        writer.write(outputfile)
    def split_pdf_file(self):
        """Split the chosen PDF into one output PDF per supplier.

        A page whose extracted text has more than five tokens and whose
        second token is '10.11.98' starts a new supplier section; the
        supplier number is the last five characters of the first
        10-character token starting with '26' on that page. Other pages with
        more than five tokens are appended to the current section. Each
        section is written to the export directory, and progress is shown in
        the progress bar and the quantity label. Problems are reported to
        the user through warning dialogs and abort the processing.
        """

        def warn(parent, text):
            # Modal warning dialog shared by all error paths.
            error_msg = QtWidgets.QMessageBox(parent)
            error_msg.setIcon(QtWidgets.QMessageBox.Warning)
            error_msg.setText(text)
            error_msg.setWindowTitle('Error split pdf-file')
            error_msg.exec_()

        def supplier_number(words):
            # Last five characters of the first matching token.
            return list(
                filter(lambda f: len(f) == 10 and f[:2] == '26',
                       words))[0][-5:]

        def save_section(section_writer, number, processed):
            # Write one supplier section; return True on success.
            if number is None:
                # No supplier page has been seen yet, so no file name can be
                # built (this used to crash with an unbound variable).
                warn(self.centralwidget, 'Error save pdf-file!')
                return False
            output_file = '\\Reconciliation_act~Acc_act~60~' + number + '~' + datetime.date.today(
            ).strftime('%Y%m%d') + '.pdf'
            try:
                with open(self.export_directory.text() + output_file,
                          'wb') as outfile:
                    section_writer.write(outfile)
                    self.label_qty.setText('Processed: ' + str(processed) +
                                           ' supplier(s)')
                    QtWidgets.QApplication.processEvents()
            except Exception:
                warn(self.centralwidget, 'Error save pdf-file!')
                return False
            return True

        if not self.pdf_name.text():
            warn(self, 'For processing please choose pdf-file!')
            return
        if not self.export_directory.text():
            warn(self, 'For processing please choose export directory!')
            return
        self.progressBar.show()
        self.label_help.hide()
        self.label_qty.show()
        try:
            pdfFileObj = open(self.pdf_name.text(), 'rb')
        except Exception:
            warn(self.centralwidget, 'Error open pdf-file!')
            return

        qty_suppl = 0
        suppl_no = None
        # The source file stays open while sections are written and is
        # closed on every exit path by the ``with`` block.
        with pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            self.progressBar.setRange(0, pdfReader.numPages - 1)
            maxVal = self.progressBar.maximum()
            writer = PyPDF2.PdfFileWriter()
            for i in range(0, pdfReader.numPages):
                # setValue expects an int; truncate the float explicitly.
                self.progressBar.setValue(int(i + (maxVal - i) / 100))
                QtWidgets.QApplication.processEvents()
                pageObj = pdfReader.getPage(i)
                check_text = pageObj.extractText().split()
                if len(check_text) > 5 and check_text[1] == '10.11.98':
                    if i != 0:
                        # A new supplier starts: flush the previous section.
                        if not save_section(writer, suppl_no, qty_suppl + 1):
                            return
                        qty_suppl += 1
                        writer = PyPDF2.PdfFileWriter()
                    suppl_no = supplier_number(check_text)
                    writer.addPage(pageObj)
                elif len(check_text) > 5:
                    writer.addPage(pageObj)

            # Flush the last section.
            if not save_section(writer, suppl_no, qty_suppl + 1):
                return
            qty_suppl += 1

        success_msg = QtWidgets.QMessageBox(self)
        success_msg.setIcon(QtWidgets.QMessageBox.Information)
        success_msg.setText('Processed: ' + str(qty_suppl) + ' supplier(s)')
        QtWidgets.QApplication.processEvents()
        success_msg.setWindowTitle('Success split pdf-file')
        success_msg.exec_()
        self.progressBar.hide()
        self.label_help.show()
Example #13
0
def lambda_handler(event, context):
    """Convert a zip of images uploaded to S3 into a PDF and upload it.

    Reads bucket and key from the first record of ``event``, downloads the
    zip into the work directory, unpacks it, converts the images found to
    PDFs, merges them and uploads the result under ``dst/<date>/<name>.pdf``.

    Returns 'success!!!' on success, or an 'Unsupported extension' message
    when the object's content type is not one of the accepted zip types.
    Any exception raised while processing is printed and re-raised.
    """
    # Bucket name and (URL-decoded) object key of the triggering upload
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    key = urllib.parse.unquote_plus(s3_record['object']['key'],
                                    encoding='utf-8')

    print(bucket, key)

    try:
        # Fetch the object
        response = s3.get_object(Bucket=bucket, Key=key)

        # Check the content type of the response
        content_type = response['ContentType']
        accepted_types = ('application/zip', 'application/x-zip-compressed',
                          'binary/octet-stream')
        if content_type not in accepted_types:
            return 'Unsupported extension. : ' + content_type

        # Key layout: src/yyyymmddhhmmssfff/filename.zip
        key_parts = key.split('/')
        created = key_parts[1]
        zip_file_name = key_parts[2]

        # The PDF gets the zip's base name with a .pdf extension
        stem, extension = getNameAndExtention(zip_file_name)
        pdf_file_name = stem + '.pdf'

        # Work folder locations and the destination key
        zip_target = os.path.join(basedir, created + "_" + zip_file_name)
        image_dir = os.path.join(basedir, 'img', created, stem)
        pdf_target = os.path.join(basedir, created + "_" + pdf_file_name)
        destination_key = '/'.join(['dst', created, pdf_file_name])
        os.makedirs(image_dir, exist_ok=True)
        for location in (zip_target, image_dir, pdf_target):
            print(location)

        # Download from S3 into the work folder
        s3_client.download_file(bucket, key, zip_target)

        # Unpack the archive
        unzip(zip_target, image_dir, extension)

        # Collect the images
        images = []
        searchImg(images, image_dir)

        # Convert each image to a PDF
        pdf_parts = []
        changeImg2Pdf(images, pdf_parts)

        # Merge the PDFs and write the result
        merged = PyPDF2.PdfFileWriter()
        margePdf(pdf_parts, merged)
        writePdf(merged, pdf_target)

        # Upload the PDF to S3
        s3_client.upload_file(pdf_target, bucket, destination_key)

        return 'success!!!'

    except Exception as e:
        print(e)
        print(
            'Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'
            .format(key, bucket))
        raise e
Example #14
0
# Take the first page of the source PDF, rotate it by 90 degrees when it is
# not wider than tall, and save it as 'Balances.pdf'. ``with`` closes both
# files even if reading or writing fails.
with open(file_path, 'rb') as file1:
    read1 = PyPDF2.PdfFileReader(file1)
    page1 = read1.getPage(0)

    page = page1.mediaBox
    if page.getUpperRight_x() - page.getUpperLeft_x() > page.getUpperRight_y(
    ) - page.getLowerRight_y():
        print('Landscape')
    else:
        print('Portrait')
        page1.rotateClockwise(90)

    # save (rotated) page
    file2 = PyPDF2.PdfFileWriter()
    file2.addPage(page1)
    with open('Balances.pdf', 'wb') as result:
        file2.write(result)

# read from pdf
# NOTE(review): ``fp`` is left open on purpose because the parser reads from
# it; presumably code after this snippet consumes and closes it - confirm.
fp = open('Balances.pdf', 'rb')
parser = PDFParser(fp)
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize('')
rsrcmgr = PDFResourceManager()
laparams = LAParams()
Example #15
0
def send_public_report(rating_decision_obj):
    """
    Download public report from AWS S3 and send to issuer.

    The downloaded PDF is re-written with password protection, e-mailed as
    an attachment to the insiders linked to the rating decision (cc: primary
    analyst), the password is sent in a second e-mail, and deletion of the
    local file is scheduled ten minutes later.

    :param rating_decision_obj: rating decision object
    """
    target_file = None

    # Send this file to the issuer
    # Fetch uploaded files from AWS S3
    # NOTE(review): this call sits outside the ``try`` below, so an
    # AnalyticalDocument.DoesNotExist raised here would not be swallowed -
    # confirm where that exception is expected to come from.
    document = get_public_report(rating_decision_obj)

    try:
        legal_name = rating_decision_obj.issuer.legal_name
        # Strip non-ASCII characters so the name is safe in a file name.
        legal_name = str(
            unicodedata.normalize('NFKD',
                                  legal_name).encode('ASCII',
                                                     'ignore').decode('utf-8'))

        target_file = (legal_name + ' draft report' + ' (' +
                       datetime.datetime.today().strftime('%Y-%m-%d, %H%M') +
                       ").pdf")

        filepath = AWS_ANALYTICAL_MEDIA_LOCATION + '/' + str(document.upload)

        download_file(filepath, target_file)

        # Password protect file
        path, filename = os.path.split(target_file)
        output_file = os.path.join(path, "temp_" + filename)
        output = PyPDF2.PdfFileWriter()

        # NOTE(review): five hex characters is a short password and the
        # owner password below is hard-coded - consider strengthening both.
        user_password = urandom(16).hex()[:5]

        # The source must stay open until the encrypted copy is written;
        # ``with`` then closes it so the file can be replaced afterwards.
        with open(target_file, "rb") as input_file:
            input_stream = PyPDF2.PdfFileReader(input_file)

            for i in range(input_stream.getNumPages()):
                output.addPage(input_stream.getPage(i))

            # Set user and owner password to pdf file
            output.encrypt(user_pwd=user_password,
                           owner_pwd='owner_pass',
                           use_128bit=True)

            with open(output_file, "wb") as outputStream:
                output.write(outputStream)

        # Replace the original with the protected copy. os.replace also
        # overwrites an existing destination on Windows, where os.rename
        # would fail.
        os.replace(output_file, target_file)

        attachments = [target_file]

        contact_list = list(
            RatingDecisionInsiderLink.objects.filter(
                rating_decision=rating_decision_obj))

        to_list = [row.insider.email for row in contact_list]

        # Send email with the protected report attached
        send_email.delay(
            header=ISSUER_HEADER,
            body=ISSUER_EMAIL %
            (rating_decision_obj.issuer.analyst.primary_analyst.first_name),
            to=to_list,
            from_sender=None,
            cc=rating_decision_obj.issuer.analyst.primary_analyst.email,
            attachments=attachments)

        # Send email with password
        # NOTE(review): this goes to the same recipients as the attachment.
        send_email.delay(
            header=ISSUER_EMAIL_HEADER_PASSWORD,
            body=ISSUER_EMAIL_BODY_PASSWORD.format(
                target_file, user_password,
                rating_decision_obj.issuer.analyst.primary_analyst.first_name),
            to=to_list,
            from_sender=None,
            cc=rating_decision_obj.issuer.analyst.primary_analyst.email,
        )

        # Schedule deletion of the local file
        later = datetime.datetime.utcnow() + timedelta(minutes=10)
        files = [os.path.abspath(target_file)]
        delete_files_task.apply_async(files, eta=later)

    except AnalyticalDocument.DoesNotExist:
        pass
Example #16
0
import PyPDF2

def _stamp(target_page, stamp_page):
    # Merge stamp_page onto target_page and hand the stamped page back.
    target_page.mergePage(stamp_page)
    return target_page


# Watermark every page of 'super.pdf' with the first page of 'wtr.pdf' and
# store the result in 'output.pdf'.
with open('super.pdf', 'rb') as file:
    reader = PyPDF2.PdfFileReader(file)
    print(reader.numPages)

    with open('wtr.pdf', 'rb') as file_wtm:
        watermark = PyPDF2.PdfFileReader(file_wtm)

        # First stamp all pages, then hand them to the writer.
        lst = [
            _stamp(reader.getPage(index), watermark.getPage(0))
            for index in range(reader.numPages)
        ]

        pdf_writer = PyPDF2.PdfFileWriter()
        for stamped in lst:
            pdf_writer.addPage(stamped)

        with open('output.pdf', 'wb') as output_file:
            pdf_writer.write(output_file)

#Or this way --> more elegant way :)

template = PyPDF2.PdfFileReader(open('super.pdf', 'rb'))
watermark2 = PyPDF2.PdfFileReader(open('wtr.pdf', 'rb'))
output = PyPDF2.PdfFileWriter()

for i in range(template.getNumPages()):
    page = template.getPage(i)
然后从已有的文档中拷贝内容到新文件中
有关PDF文件操作一般遵循以下步骤:

1.打开一个或者多个已有的PDF文件,得到PdfFileReader对象
2.创建一个新的PdfFileWriter对象
3.将页面从PdfFileReader对象拷贝到PdfFileWriter对象
4.最后利用PdfFileWriter对象的write()写入新创建的PDF文件
'''

# Watermark only the first page of 'meetingminutes.pdf' and copy the other
# pages unchanged into 'watermarkedCover.pdf'. ``with`` keeps the inputs open
# until the result is written and closes every file afterwards.
with open('meetingminutes.pdf', 'rb') as minutesFile, \
        open('watermark.pdf', 'rb') as watermarkFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)  # create PdfFileReader

    minutesFirstPage = pdfReader.getPage(0)
    pdfWatermarkReader = PyPDF2.PdfFileReader(watermarkFile)

    minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))

    pdfWriter = PyPDF2.PdfFileWriter()  # create PdfFileWriter
    pdfWriter.addPage(minutesFirstPage)

    for pageNum in range(1, pdfReader.numPages):
        pageObj = pdfReader.getPage(pageNum)
        pdfWriter.addPage(pageObj)

    # Create the new PDF file and write the PdfFileWriter content into it
    with open('watermarkedCover.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)
import PyPDF2 as pdf
import os

SONGS_PATH = "C:/Users/tomas/zpevnik"

paths = sorted(os.listdir(SONGS_PATH))
paths.insert(39, paths.pop())  # hack: puts Čechomor at its correct place in the alphabet

output = pdf.PdfFileWriter()

# Build a songbook from the first page of every file, with one bookmark per
# song. The source files have to stay open until the output is written, so
# they are collected here and closed in ``finally``.
files = []
try:
    for i, path in enumerate(paths):
        print(f"Working on {i}: ", end=" ")
        song_file = open(os.path.join(SONGS_PATH, path), "rb")
        files.append(song_file)

        song_reader = pdf.PdfFileReader(song_file)
        output.addPage(song_reader.getPage(0))

        # Bookmark title: the file name without its extension.
        page_name = os.path.splitext(os.path.basename(path))[0]
        print(page_name)

        output.addBookmark(page_name, i)

    with open("out.pdf", "wb") as f:
        output.write(f)
finally:
    for song_file in files:
        song_file.close()
Example #19
0
    def extract_pages_from_pdf(
        self,
        source_path: str = None,
        output_path: str = None,
        pages: ListOrString = None,
    ) -> None:
        """Copy selected pages of a PDF into a new PDF document.

        Page numbers start from 1.

        When ``source_path`` is not given, the PDF that is already open is
        used.

        **Examples**

        **Robot Framework**

        .. code-block:: robotframework

            ***Settings***
            Library    RPA.PDF

            ***Tasks***
            Example Keyword
                Extract Pages From PDF
                ...          source_path=/tmp/sample.pdf
                ...          output_path=/tmp/output.pdf
                ...          pages=5

        **Python**

        .. code-block:: python

            from RPA.PDF import PDF

            pdf = PDF()

            def example_keyword():
                pdf.extract_pages_from_pdf(
                    source_path="/tmp/sample.pdf",
                    output_path="/tmp/output.pdf",
                    pages=5
                )

        :param source_path: filepath to the source pdf.
        :param output_path: filepath to the target pdf, stored by default
            in `output_directory`.
        :param pages: page numbers to extract from PDF (numbers start from 1)
            if None then extracts all pages.
        """
        self.switch_to_pdf(source_path)
        active_reader = self.ctx.active_pdf_document.reader
        pdf_writer = PyPDF2.PdfFileWriter()

        if output_path:
            destination = Path(output_path)
        else:
            destination = self.default_output

        for number in self._get_page_numbers(pages, active_reader):
            # Convert the 1-based page number to PyPDF2's 0-based index.
            zero_based = int(number) - 1
            pdf_writer.addPage(active_reader.getPage(zero_based))
        with open(str(destination), "wb") as stream:
            pdf_writer.write(stream)
Example #20
0
def make_writer(pdf):
    """Return a ``PdfFileWriter`` containing every page of the PDF at ``pdf``.

    The file content is copied into memory before it is parsed. PyPDF2
    resolves page objects lazily from the reader's stream, so a writer built
    on top of a file handle that has already been closed fails when it is
    finally written; the in-memory buffer stays readable for as long as the
    returned writer is alive.

    :param pdf: path of the PDF file to load.
    :return: a ``PyPDF2.PdfFileWriter`` holding all pages of the source file.
    """
    import io

    with open(pdf, 'rb') as f:
        buffered = io.BytesIO(f.read())

    r = PyPDF2.PdfFileReader(buffered)
    w = PyPDF2.PdfFileWriter()
    w.appendPagesFromReader(r)
    return w
Example #21
0
    def add_watermark_image_to_pdf(
        self,
        image_path: str,
        output_path: str,
        source_path: str = None,
        coverage: float = 0.2,
    ) -> None:
        """Add image to PDF which can be new or existing PDF.

        If no source path given, assumes a PDF is already opened.

        The image is first rendered onto a single-page temporary PDF, and
        that page is then merged onto every page of the source document.

        **Examples**

        **Robot Framework**

        .. code-block:: robotframework

            ***Settings***
            Library    RPA.PDF

            ***Tasks***
            Example Keyword
                Add Watermark Image To PDF
                ...             image_path=approved.png
                ...             source_path=/tmp/sample.pdf
                ...             output_path=output/output.pdf

        **Python**

        .. code-block:: python

            from RPA.PDF import PDF

            pdf = PDF()

            def example_keyword():
                pdf.add_watermark_image_to_pdf(
                    image_path="approved.png",
                    source_path="/tmp/sample.pdf",
                    output_path="output/output.pdf",
                )

        :param image_path: filepath to image file to add into PDF
        :param output_path: filepath of target PDF
        :param source_path: filepath to source, if not given add image to
            currently active PDF
        :param coverage: how the watermark image should be scaled on page,
            as a fraction of the first page's width and height,
            defaults to 0.2
        """
        self.switch_to_pdf(source_path)
        reader = self.ctx.active_pdf_document.reader

        # The bounding box for the watermark is derived from the first page.
        mediabox = reader.getPage(0).mediaBox
        max_width = int(float(mediabox.getWidth()) * coverage)
        max_height = int(float(mediabox.getHeight()) * coverage)

        # Only the pixel dimensions are needed; release the image handle
        # right away instead of leaving it open.
        with Image.open(image_path) as im:
            image_size = im.size
        width, height = self.fit_dimensions_to_box(*image_size, max_width,
                                                   max_height)

        pdf = FPDF()
        pdf.add_page()
        pdf.image(name=image_path, x=40, y=60, w=width, h=height)

        # A uniquely named temporary file avoids clashes between concurrent
        # runs, which a fixed "temp.pdf" in the shared temp directory cannot.
        handle, temp_pdf = tempfile.mkstemp(suffix=".pdf")
        os.close(handle)
        try:
            pdf.output(name=temp_pdf)

            img = PyPDF2.PdfFileReader(temp_pdf)
            watermark = img.getPage(0)

            writer = PyPDF2.PdfFileWriter()
            for n in range(reader.getNumPages()):
                page = reader.getPage(n)
                page.mergePage(watermark)
                writer.addPage(page)

            with open(output_path, "wb") as f:
                writer.write(f)
        finally:
            # The temporary watermark PDF is not needed once the output has
            # been written (or the attempt has failed).
            os.remove(temp_pdf)
Example #22
0
import os
import PyPDF2

import glob

# todo learn gitIgnore

# Remove the password from every PDF directly inside the "Data" directory,
# overwriting each file in place.
filePaths = glob.glob("Data/*.pdf")

# The password is stored in a separate file; newlines are stripped from it.
with open("Password", mode="r", encoding="utf-8") as f:
    password = f.read().replace("\n", "")

for path in filePaths:
    pdf = PyPDF2.PdfFileReader(path)

    # Calling decrypt() on a PDF that is not encrypted raises, and such a
    # file needs no rewriting anyway.
    if not pdf.isEncrypted:
        continue

    # decrypt() returns 0 when the password does not match; leave the file
    # untouched in that case instead of failing halfway through.
    if not pdf.decrypt(password):
        print("Could not decrypt " + path)
        continue

    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.cloneReaderDocumentRoot(pdf)

    # NOTE(review): this overwrites the source file; it relies on PyPDF2
    # having loaded a path-based reader fully into memory - confirm for the
    # installed PyPDF2 version.
    with open(path, mode="wb") as f:
        dst_pdf.write(f)
Example #23
0
# ch18_8.py
import PyPDF2, os, re

# Walk the chapter folder and write a password-protected copy of every PDF
# found, next to the original, named "<original stem>_encryt.pdf".
try:
    for dirName, sub_dirNames, fileNames in os.walk(
            'F:\\Book\\電腦書\\Python入門\\範例檔案\\ch18'):
        for fn in fileNames:
            # Match on the real extension rather than a regex search, which
            # only captured part of names containing spaces or dashes.
            if not fn.lower().endswith('.pdf'):
                continue
            pdfFn = fn
            # os.walk yields bare file names, so they must be joined with
            # the directory they were found in before being opened.
            srcPath = os.path.join(dirName, pdfFn)
            dstPath = os.path.join(
                dirName, os.path.splitext(pdfFn)[0] + '_encryt.pdf')
            with open(srcPath, 'rb') as pdfObj:
                pdfRd = PyPDF2.PdfFileReader(pdfObj)
                pdfWr = PyPDF2.PdfFileWriter()  # new PDF object
                for pageNum in range(pdfRd.numPages):
                    # copy one page at a time into the new PDF object
                    pdfWr.addPage(pdfRd.getPage(pageNum))
                pdfWr.encrypt('python')  # apply the encryption
                # open a binary file for writing; the source must still be
                # open while the writer is written out
                with open(dstPath, 'wb') as encryptPdf:
                    pdfWr.write(encryptPdf)  # perform the write
            print(pdfFn + '寫入成功')
except FileNotFoundError as e:
    print(e)
finally:
    print('程式結束!!')
Example #24
0
import PyPDF2
import sys
import os

# Usage: <script> TEMPLATE_PDF WATERMARK_PDF
# Stamps the first page of WATERMARK_PDF onto every page of TEMPLATE_PDF and
# saves the result as 'watermarked_output2.pdf'.
template = sys.argv[1]
watermark = sys.argv[2]

# Both inputs stay open until the output is written because the writer pulls
# page data from the readers at write time.
with open(template, 'rb') as template_file, \
        open(watermark, 'rb') as watermark_file:
    template1 = PyPDF2.PdfFileReader(template_file)  # creates a file reader
    watermark1 = PyPDF2.PdfFileReader(watermark_file)
    output = PyPDF2.PdfFileWriter()  # creates a write object

    # getNumPages gives how many pages the template file has
    for i in range(template1.getNumPages()):
        page = template1.getPage(i)  # take one page at a time
        # only the first page of the watermark file is used
        page.mergePage(watermark1.getPage(0))
        output.addPage(page)

    # Write once after all pages are collected instead of rewriting the
    # whole output file on every loop iteration.
    with open('watermarked_output2.pdf', 'wb') as file:
        output.write(file)


# Print the page count and the extracted text of 'file.pdf'.
# The file must be opened in read binary mode; the context manager closes it
# once all text has been extracted.
with open('file.pdf', 'rb') as pdfFile:
    reader = PyPDF2.PdfFileReader(pdfFile)
    print(reader.numPages)  # number of pages in file
    page = reader.getPage(0)  # gets specific page
    print(page.extractText())

    # Text of every page, in order (the first page is printed again here).
    for pageNum in range(reader.numPages):
        print(reader.getPage(pageNum).extractText())

# PyPDF2 can add, remove, and reorder pages, but it cannot edit text

# Concatenate 'file1' and 'file2' into 'combinedPdf.pdf'.
# Context managers close all three files even if reading or writing fails;
# the inputs stay open until the writer has been written out.
with open('file1', 'rb') as pdf1, open('file2', 'rb') as pdf2:
    reader1 = PyPDF2.PdfFileReader(pdf1)
    reader2 = PyPDF2.PdfFileReader(pdf2)
    writer = PyPDF2.PdfFileWriter()  # blank pdf in memory

    # Pages of the first document, followed by pages of the second.
    for pageNum in range(reader1.numPages):
        page = reader1.getPage(pageNum)
        writer.addPage(page)

    for pageNum in range(reader2.numPages):
        page = reader2.getPage(pageNum)
        writer.addPage(page)

    with open('combinedPdf.pdf', 'wb') as outputFile:
        writer.write(outputFile)
Example #26
0
def exportPdf(fin, outdir, annotations, verbose):
    '''Export PDF with annotations.

    <fin>: string, absolute path to input PDF file.
    <outdir>: string, absolute path to the output directory. Created if it
              does not exist.
    <annotations>: FileAnno obj. The attributes read here are .hasfile,
                   .hlpages, .highlights, .ntpages, .notes and .filename.
    <verbose>: not used by this function.

    Return: None. The annotated copy is written to
            <outdir>/<annotations.filename>, replacing an existing file.

    Raises AssertionError if <annotations>.hasfile is false, IOError if
    <fin> cannot be opened, and Exception("Skip encrypt") if the PDF is
    reported as encrypted.

    Update time: 2016-02-19 14:32:56.
    '''

    #---------------Skip unlinked files---------------
    assert annotations.hasfile, 'no file of %s' % fin

    try:
        fin_handle = open(fin, 'rb')
    except IOError:
        print('Could not find pdf file %s' % fin)
        # Re-raise: there is nothing to export without the input, and
        # continuing would only fail later on the missing reader.
        raise

    # The input must stay open until the output is written, because page
    # data is pulled from the reader at write time.
    with fin_handle:
        inpdf = PyPDF2.PdfFileReader(fin_handle)
        if inpdf.isEncrypted:
            # PyPDF2 seems to think some files are encrypted even if they
            # are not. Trying to decrypt takes a lot of time, and as this
            # rarely happens to academic docs it is simply treated as fail.
            raise Exception("Skip encrypt")

        # retain meta data
        meta = inpdf.getDocumentInfo()
        outpdf = PyPDF2.PdfFileWriter()
        outpdf.addMetadata(meta)

        #----------------Loop through pages----------------
        # Annotation lookups are keyed by 1-based page numbers, while
        # getPage() takes a 0-based index.
        pages = range(1, inpdf.getNumPages() + 1)

        for pii in pages:

            inpg = inpdf.getPage(pii - 1)

            #----------------Process highlights----------------
            if pii in annotations.hlpages:
                for hjj in annotations.highlights[pii]:
                    # Changes suggested by matteosecli: add author of
                    # highlight:
                    anno = pdfannotation.createHighlight(hjj["rect"],
                                                         author=hjj['author'],
                                                         cdate=hjj["cdate"],
                                                         color=hjj['color'])
                    inpg = pdfannotation.addAnnotation(inpg, outpdf, anno)

            #------------------Process notes------------------
            if pii in annotations.ntpages:
                for njj in annotations.notes[pii]:
                    note = pdfannotation.createNote(njj["rect"],
                                                    contents=njj["content"],
                                                    author=njj["author"],
                                                    cdate=njj["cdate"])
                    inpg = pdfannotation.addAnnotation(inpg, outpdf, note)

            outpdf.addPage(inpg)

        # this is a fix provided by rongmu regarding issue #22:
        # https://github.com/Xunius/Menotexport/issues/22
        # As it is not covered by tests it is kept best-effort inside a try

        # fix start ----------------------------------{{{
        # Copy the root (document catalog) except for /Pages
        # PDF Reference, Sixth Edition, version 1.7, p.137
        # https://www.adobe.com/devnet/pdf/pdf_reference_archive.html
        try:
            for k, v in inpdf.trailer["/Root"].items():
                if k.getObject() != "/Pages":
                    outpdf._root_object.update({k: v})
        except Exception:
            # Deliberately ignored: the export is still usable without the
            # extra catalog entries. Unlike a bare except, this lets
            # KeyboardInterrupt/SystemExit propagate.
            pass
        # fix end ----------------------------------}}}

        #-----------------------Save-----------------------
        filename = annotations.filename
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        abpath_out = os.path.join(outdir, filename)
        if os.path.isfile(abpath_out):
            os.remove(abpath_out)

        with open(abpath_out, mode='wb') as fout:
            outpdf.write(fout)

    return
Example #27
0
    # Directory that holds the parent PDF and receives the two halves.
    pdf_path = "./PDF/"

    # Get the name of the parent pdf file from the user.
    # This file will be demerged into 2 files.
    parent_pdf = getFileNameFromUser("Give parent PDF name with .pdf extn",
                                     pdf_path)

    # The parent file stays open until both halves are written, because the
    # writers pull page data from the reader at write time.
    with open(os.path.join(pdf_path, parent_pdf), "rb") as pdfFileObj:
        # Pass the file object to the file reader
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # One writer per output half
        pdfWriter1 = PyPDF2.PdfFileWriter()
        pdfWriter2 = PyPDF2.PdfFileWriter()

        # Floor division keeps the split point an integer; true division
        # yields a float on Python 3.
        # NOTE(review): addPageToWriter presumably copies the pages in
        # [start, end) from the reader into the writer - confirm.
        half = pdfReader.numPages // 2
        addPageToWriter(pdfReader, pdfWriter1, 0, half)
        addPageToWriter(pdfReader, pdfWriter2, half, pdfReader.numPages)

        # The user is asked for the two output names in order, before
        # anything is written.
        with open(os.path.join(pdf_path, getFinalPdfNameFromUser()),
                  "wb") as pdfOutputFileObj1, \
                open(os.path.join(pdf_path, getFinalPdfNameFromUser()),
                     "wb") as pdfOutputFileObj2:
            print(".. Demerging parent pdf file....")
            pdfWriter1.write(pdfOutputFileObj1)
            pdfWriter2.write(pdfOutputFileObj2)
            print("... done...")
Example #28
0
import PyPDF2 as p

# Reader for the document to stamp and reader for the PDF supplying the
# stamp; `output` collects the stamped pages.
template = p.PdfFileReader(open('merged_pdf.pdf', 'rb'))
water = p.PdfFileReader(open('pdf3.pdf', 'rb'))
output = p.PdfFileWriter()

# Merge the first page of the stamp PDF onto each template page in turn.
page_count = template.getNumPages()
for page_index in range(page_count):
    stamped = template.getPage(page_index)
    stamped.mergePage(water.getPage(0))
    output.addPage(stamped)

# Save the stamped document.
with open('watermarkedPdf.pdf', 'wb') as out_file:
    output.write(out_file)
Example #29
0
    fileList.append(pdfFileObj)

# Wrap every already-opened PDF file in a PdfFileReader
readerList = [PyPDF2.PdfFileReader(openedFile) for openedFile in fileList]

# Report the page count of each input and the overall total
totalPages = 0
for reader in readerList:
    pageCount = reader.numPages
    print(pageCount)
    totalPages += pageCount
print("total pages:", totalPages)

# Collect the pages of all inputs, in list order, into a single writer
pdfWriter = PyPDF2.PdfFileWriter()
for reader in readerList:
    for pageIndex in range(reader.numPages):
        pdfWriter.addPage(reader.getPage(pageIndex))

# Save the merged document, refusing to overwrite an existing file
outputName = mergedFile + ".pdf"
if not os.path.isfile(outputName):
    pdfOutputFile = open(outputName, 'wb')
    pdfWriter.write(pdfOutputFile)
    pdfOutputFile.close()
else:
    print('File already exists, specify another filename')

# Close all read files
for pdfFile in fileList:
Example #30
0
import PyPDF2

pdf1 = open('JadaDixon_rates_of_reaction_lab.pdf', 'rb')
pdf2 = open('JadaDixon_reaction_rate_p_and_d_.pdf', 'rb')
pdf3 = open('JadaDixon_Redox_Lab_Template.pdf', 'rb')
pdf4 = open('JadaDixonchemistry.pdf', 'rb')
pdf5 = open('jadadixonenergeticslab.pdf', 'rb')
pdf6 = open('JadaDixonSBaLab2.pdf', 'rb')
pdf7 = open('Chemistry_P_and_D__Jada_Dixon.pdf', 'rb')


pdf1reader = PyPDF2.PdfFileReader(pdf1)
pdf2reader = PyPDF2.PdfFileReader(pdf2)
pdf3reader = PyPDF2.PdfFileReader(pdf3)
pdf4reader = PyPDF2.PdfFileReader(pdf4)
pdf5reader = PyPDF2.PdfFileReader(pdf5)
pdf6reader = PyPDF2.PdfFileReader(pdf6)
pdf7reader = PyPDF2.PdfFileReader(pdf7)


pdfwrite = PyPDF2.PdfFileWriter()

for i in range(pdf1reader.numPages):
    pageO = pdf1reader.getPage(i)
    pdfWRITE