Beispiel #1
0
def split(path, name_of_split):
    """Write every page of a PDF to its own single-page file.

    Args:
        path: Location of the PDF to take apart.
        name_of_split: Prefix of the generated files; the page with
            zero-based index ``i`` is stored as ``<name_of_split><i>.pdf``.
    """
    total_pages = PdfFileReader(path).getNumPages()
    for page_number in range(total_pages):
        # A fresh reader per page is deliberate: the original author noted
        # that re-creating it in the loop prevents a crash.
        source = PdfFileReader(path)
        single_page = PdfFileWriter()
        single_page.addPage(source.getPage(page_number))

        target = f'{name_of_split}{page_number}.pdf'
        with open(target, 'wb') as handle:
            single_page.write(handle)
Beispiel #2
0
def pdf_add_metadata(path, file, key, value, out_file, out_path=''):
    """Copy a PDF and attach one custom document-info entry to the copy.

    The document info of the input is printed, all pages are appended to a
    new writer, and ``{'/' + key: value}`` is added as metadata before the
    result is written.

    Args:
        path: Directory containing the input PDF.
        file: File name of the input PDF inside ``path``.
        key: Metadata key without the leading slash (``'/'`` is prepended).
        value: Value stored under that key.
        out_file: File name of the PDF to write.
        out_path: Directory for the output; an empty string means ``path``.
    """
    if out_path == '':
        out_path = path
    with open(os.path.join(path, file), 'rb') as pdf:
        try:
            pdf_reader = PdfFileReader(pdf)
            print(pdf_reader.getDocumentInfo())

            pdf_writer = PdfFileWriter()
            pdf_writer.appendPagesFromReader(pdf_reader)
            pdf_writer.addMetadata({'/' + key: value})

            # Write while the input is still open (the pages come from the
            # reader's stream); ``with`` closes the output even when the
            # write fails.
            with open(os.path.join(out_path, out_file), 'wb') as file_out:
                pdf_writer.write(file_out)
        except ValueError as err:
            # Best effort, as before: report the problem and carry on, but
            # show the actual error instead of the exception class.
            print(f'Could not add metadata to {file}: {err}')
Beispiel #3
0
def pdf_add_Stamp(path, file, stamp):
    """Overlay a stamp on the first page of a PDF and save it as a new file.

    Only the first page of the input ends up in the output. The result is
    written to ``new_w_stamp.pdf`` inside ``path`` and carries a fixed
    ``/NumeroCadastre`` metadata entry.

    Args:
        path: Directory that holds both the input PDF and the stamp PDF, and
            that receives the output file.
        file: File name of the PDF to stamp.
        stamp: File name of the PDF whose first page is used as the stamp.
    """
    with open(os.path.join(path, file), 'rb') as pdf, \
            open(os.path.join(path, stamp), 'rb') as file_stamp:
        pdf_reader = PdfFileReader(pdf)
        watermark = PdfFileReader(file_stamp)

        first_page = pdf_reader.getPage(0)
        first_page.mergePage(watermark.getPage(0))

        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(first_page)
        pdf_writer.addMetadata({'/NumeroCadastre': 'LaTeteAToto'})

        # Both inputs stay open until the write has finished. The readers
        # themselves need no explicit close (PdfFileReader has no close()
        # method); the ``with`` blocks release the underlying files.
        with open(os.path.join(path, 'new_w_stamp.pdf'), 'wb') as file_out:
            pdf_writer.write(file_out)
Beispiel #4
0
def join(paths, output):
    """
    Joins an unlimited number of pdfs together.

    Args:
        paths: The pdfs to be joined, in the order they will appear in the
            output pdf.
        output: Destination of the joined pdf. Either an already opened
            binary stream (anything with a ``write`` method) or a path,
            which is then opened and closed here.
    """

    pdf_writer = PdfFileWriter()

    for path in paths:
        pdf_reader = PdfFileReader(path)
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

    # Write once, after every input has been collected, instead of emitting
    # a growing document after each input file.
    if hasattr(output, 'write'):
        pdf_writer.write(output)
    else:
        with open(output, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)
def write_pdf_subset(input_path, output_path, page_start, page_end):
    """Copy a contiguous range of pages from one PDF into a new PDF.

    Args:
        input_path: PDF to read from.
        output_path: Path of the PDF to create.
        page_start: Zero-based index of the first page to copy.
        page_end: Zero-based index one past the last page to copy.

    Raises:
        IndexError: If the input has fewer than ``page_end`` pages. Nothing
            is written in that case.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(input_path)

    if pdf_reader.getNumPages() < page_end:
        # The original built this exception without raising it. IndexError
        # is used because an out-of-range page lookup is presumably what
        # callers ran into before - TODO confirm against callers.
        raise IndexError("too few pages")

    for page in range(page_start, page_end):
        # Add each requested page to the writer object
        pdf_writer.addPage(pdf_reader.getPage(page))

    # Write out the extracted pages
    with open(output_path, 'wb') as out:
        pdf_writer.write(out)
Beispiel #6
0
def create_watermark(input_pdf, output, watermark):
    """Merge the first page of ``watermark`` into every page of a PDF.

    Args:
        input_pdf: PDF that receives the watermark.
        output: Path of the watermarked PDF to write.
        watermark: PDF whose first page is used as the watermark.
    """
    stamp_source = PdfFileReader(watermark)
    stamp = stamp_source.getPage(0)

    source = PdfFileReader(input_pdf)
    result = PdfFileWriter()

    # TODO (carried over from the original author): this watermarks all the
    # pages and still needs to be changed to a single one.
    for index in range(source.getNumPages()):
        current = source.getPage(index)
        current.mergePage(stamp)
        result.addPage(current)

    with open(output, 'wb') as target:
        result.write(target)
    def _download(self, url, path):
        """Assemble a PDF from pages that are fetched one at a time.

        The document code is the last ``/``-separated piece of ``url``.
        Pages ``1 .. page_size`` are requested individually through
        ``self._get_one_page_pdf`` and the first page of each response is
        appended to the result.

        Args:
            url: Address of the document.
            path: Where the assembled PDF is written.

        Returns:
            ``path``, unchanged.
        """
        document_code = url.rsplit("/", 1)[-1]
        total = self._get_pdf_page_size(url)

        assembled = PdfFileWriter()
        for number in trange(1, total + 1):
            single_page_io = self._get_one_page_pdf(document_code, number)
            single_page_reader = PdfFileReader(single_page_io)
            assembled.addPage(single_page_reader.getPage(0))

        with open(path, "wb") as target:
            assembled.write(target)
        return path
Beispiel #8
0
def Split_pdf(path, file, Out_Path):
    """Split a PDF into one file per page and log every page written.

    Output files are named ``<stem>-F<NNN>.pdf``, where ``<stem>`` is
    ``file`` without its extension and ``<NNN>`` is the one-based page
    number padded to three digits.

    Args:
        path: Directory containing the input PDF.
        file: File name of the input PDF.
        Out_Path: Directory that receives the single-page PDFs.
    """
    log_file = 'PDF_utils_log'
    source = os.path.join(path, file)
    stem = os.path.splitext(file)[0]

    for frame in range(PdfFileReader(source).getNumPages()):
        # The reader is re-created per page, as in the original code.
        pdf = PdfFileReader(source)
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(frame))

        frame_label = format(frame + 1, '03d')
        Out_filename = stem + "-F" + frame_label + ".pdf"
        with open(os.path.join(Out_Path, Out_filename), 'wb') as out:
            pdf_writer.write(out)
        # Report success only once the file has been closed; ``with`` has
        # already done that, so no explicit close() is needed.
        log_info(
            "File " + file + " frame " + frame_label
            + " successfully splitted", log_file)
Beispiel #9
0
def put_watermark(input_pdf, output_pdf, watermark):
    """Stamp every page of a PDF with the first page of a watermark PDF.

    Args:
        input_pdf: PDF to be watermarked.
        output_pdf: Path of the watermarked copy to write.
        watermark: Ready-made PDF whose first page is the watermark.
    """
    # Only page one of the watermark document is used.
    overlay_reader = PdfFileReader(watermark)
    overlay = overlay_reader.getPage(0)

    document = PdfFileReader(input_pdf)
    stamped = PdfFileWriter()

    for number in range(document.getNumPages()):
        current = document.getPage(number)
        # Merge the watermark page into the current page, then queue the
        # merged page for output.
        current.mergePage(overlay)
        stamped.addPage(current)

    with open(output_pdf, 'wb') as target:
        stamped.write(target)
Beispiel #10
0
def merge_pdfs(file_path, pdf_writer=None, page=None):
    """
    Utility function to compose a PDF file from parts of other PDF files. The
    output of the function is a PdfFileWriter object from PyPDF4. In order to
    save the resulting object output 'pdf_writer' as a PDF file, use the
    following logic in your routine:

    ```
    with open('save/path/here.pdf', 'wb') as out:
        pdf_writer.write(out)
    ```

    @param file_path: path of the PDF file to get pages from.
    @param pdf_writer: PyPDF4 object PdfFileWriter. If None, starts a new writer.
    @param page: int or list of ints. Number of the page from the file to be added.
                 If None, all pages from 'file_path' are added in order. If list of
                 ints, selected pages are added in the order of the list.
    @return: PdfFileWriter object with the new pages added.
    """

    pdf_reader = PdfFileReader(str(file_path))

    if pdf_writer is None:
        pdf_writer = PdfFileWriter()

    if page is None:
        selected = range(pdf_reader.getNumPages())
    else:
        # Only the "is it iterable?" probe is guarded. A TypeError raised
        # while actually adding pages now propagates instead of being
        # mistaken for "page is a single int" after some pages were added.
        try:
            selected = list(page)
        except TypeError:
            selected = [page]

    for pg in selected:
        pdf_writer.addPage(pdf_reader.getPage(pg))

    return pdf_writer
Beispiel #11
0
def readPdf_test_read_write():
    """Copy the fourth page of a fixed sample PDF into a new one-page PDF.

    Both paths are hard-coded: the sample is read from ``d:/data/pdf-scan``
    and the result is written to ``n.pdf`` in the same directory.
    """
    with open(r'd:/data/pdf-scan/普通生物学(清晰PDF版).pdf', 'rb') as source:
        pdf_input = PdfFileReader(source)
        # Page indices are zero-based, so index 3 is the fourth page.
        page = pdf_input.getPage(3)

        pdf_output = PdfFileWriter()
        pdf_output.addPage(page)

        # Write while the source is still open, and close the target
        # deterministically so the data is flushed to disk.
        with open(r'd:/data/pdf-scan/n.pdf', 'wb') as target:
            pdf_output.write(target)
def frasers2up(inputPdfFileWriter):
    """Combine consecutive page pairs into side-by-side sheets.

    Pages are taken two at a time (0 and 1, 2 and 3, ...). For each pair a
    blank page with the size of the left page is created, the right page is
    merged in shifted horizontally by the left page's width, and the left
    page is merged on top. When the page count is odd, the final unpaired
    page is not part of the result.

    Args:
        inputPdfFileWriter: Object offering ``getPage`` and ``getNumPages``.

    Returns:
        A new ``Pdfwrite`` object holding one sheet per page pair.
    """
    sheets = Pdfwrite()
    scratch = Pdfwrite()

    # The first page is looked up before the loop, as in the original; the
    # values are recomputed for every pair below, but an input without a
    # first page still fails at this point.
    first_corner = inputPdfFileWriter.getPage(0).mediaBox.upperRight
    width, height = first_corner[0], first_corner[1]

    for start in range(0, inputPdfFileWriter.getNumPages() - 1, 2):
        left = inputPdfFileWriter.getPage(start)
        right = inputPdfFileWriter.getPage(start + 1)
        width = left.mediaBox.upperRight[0]
        height = left.mediaBox.upperRight[1]

        # A new blank page is inserted at index 0 each time and fetched back
        # from there, so every pair gets its own sheet.
        scratch.insertBlankPage(width, height, 0)
        sheet = scratch.getPage(0)
        # NOTE(review): the trailing 1 is presumably PyPDF2's ``expand``
        # flag, which would grow the sheet to fit the shifted page - confirm.
        sheet.mergeTranslatedPage(right, width, 0, 1)
        sheet.mergePage(left)
        sheets.addPage(sheet)

    return sheets
Beispiel #13
0
    def split_pdf(self, path: str, name_of_split: str):
        """Split a PDF into one single-page file per page.

        Each page with zero-based index ``i`` is written to
        ``self.processed + name_of_split + str(i) + ".pdf"``.

        Args:
            path: Location of the original PDF.
            name_of_split: Base name of the generated files, without suffix.

        Returns:
            A fixed completion message.
        """
        pdf = PdfFileReader(path)
        for page in range(pdf.getNumPages()):
            # A new writer per page: with one shared writer, file N would
            # also contain pages 0..N-1 instead of page N alone.
            pdf_writer = PdfFileWriter()
            pdf_writer.addPage(pdf.getPage(page))

            output = self.processed + name_of_split + str(page) + ".pdf"
            with open(output, 'wb') as output_pdf:
                pdf_writer.write(output_pdf)

        return "切分完成!"
Beispiel #14
0
    def SaveButton(self, event):
        input_filename = Path(self.input_file)
        dlg = wx.FileDialog(self,
                            message="Save file as...",
                            defaultDir=os.getcwd(),
                            defaultFile=(f"{input_filename.with_suffix('')}+"
                                         f"{self.no_pages.GetValue()}"),
                            wildcard=pdfs,
                            style=wx.FD_SAVE | wx.FD_OVERWRITE_PROMPT)

        if not self.input_path:
            self.error_message("Please select a pdf")

        elif dlg.ShowModal() == wx.ID_OK:
            output_path = dlg.GetPath()
            pdf_writer = PdfFileWriter()
            pdf_reader = PdfFileReader(self.input_path)
            try:
                blank_pages = int(self.no_pages.GetValue())
            except ValueError:
                self.error_message(
                    "The number of pages added must be an integer")
            pdf_writer.appendPagesFromReader(pdf_reader)
            for _ in range(blank_pages):
                pdf_writer.addBlankPage()

            with open(output_path, 'wb') as output:
                pdf_writer.write(output)

            success_dlg = wx.MessageDialog(
                self, f"""You created {dlg.GetFilename()} and saved it at
{dlg.GetPath()}.""", "Success!", wx.OK | wx.ICON_INFORMATION)
            success_dlg.ShowModal()
            success_dlg.Destroy()
            dlg.Destroy()
            self.clear_func()
Beispiel #15
0
def pdf_generate(inpath, generate_list, outpath):
    """Concatenate several PDFs into one output file.

    Args:
        inpath: Only used to derive the default output name
            ``<basename without extension>_output.pdf``.
        generate_list: The PDFs to concatenate, in output order.
        outpath: Path of the PDF to write; a falsy value selects the
            default name derived from ``inpath``.
    """
    if not outpath:
        fname = os.path.splitext(os.path.basename(inpath))[0]
        outpath = '{}_output.pdf'.format(fname)

    pdf_writer = PdfFileWriter()

    # Collect all pages first. The output is only created once every input
    # could be read, so a failing input no longer leaves an empty file.
    for source in generate_list:
        pdf = PdfFileReader(source)
        for page in range(pdf.getNumPages()):
            pdf_writer.addPage(pdf.getPage(page))

    with open(outpath, 'wb') as out:
        pdf_writer.write(out)

    print('Created: {}'.format(outpath))
Beispiel #16
0
def split(input_path, page_ranges, output_name):
    """Extract page ranges from a PDF, one output file per range.

    Args:
        input_path: PDF to extract from.
        page_ranges: Range specifications (single page numbers or
            hyphen-separated ranges); each is turned into a
            ``(start, stop)`` pair by ``format_range``.
        output_name: Prefix of the generated files, which are named
            ``<output_name>_p<page_range>.pdf``.
    """
    for page_range in page_ranges:
        # Reader and writer are created anew for every range.
        source = PdfFileReader(input_path)
        extract = PdfFileWriter()

        first, last = format_range(page_range)
        for number in range(first, last):
            extract.addPage(source.getPage(number))

        target = f"{output_name}_p{page_range}.pdf"
        with open(target, 'wb') as handle:
            extract.write(handle)
def cut_left(file_path, output_file, points=66):
    """Trim a strip of ``points`` units off every page and save the result.

    Pages whose media box is wider than tall have the x coordinate of their
    lower-left and upper-left corners increased by ``points``. All other
    pages have the y coordinate of their upper-left and upper-right corners
    decreased by ``points``. Afterwards the art, bleed and crop boxes are
    set to the media box.

    Args:
        file_path: PDF to read.
        output_file: Path of the trimmed PDF to write.
        points: Size of the strip to remove.
    """
    with open(str(file_path), 'rb') as source:
        reader = PdfFileReader(source)
        writer = PdfFileWriter()
        total = reader.getNumPages()

        for index in trange(total):
            page = reader.getPage(index)
            box_width = page.mediaBox.getWidth()
            box_height = page.mediaBox.getHeight()

            if box_width > box_height:
                # Both corners are read before either is assigned, exactly
                # as in the original statement order.
                corner = page.mediaBox.getLowerLeft()
                new_lower_left = (corner[0] + points, corner[1])
                corner = page.mediaBox.getUpperLeft()
                new_upper_left = (corner[0] + points, corner[1])

                page.mediaBox.lowerLeft = new_lower_left
                page.mediaBox.upperLeft = new_upper_left
            else:
                corner = page.mediaBox.getUpperLeft()
                new_upper_left = (corner[0], corner[1] - points)
                corner = page.mediaBox.getUpperRight()
                new_upper_right = (corner[0], corner[1] - points)

                page.mediaBox.upperLeft = new_upper_left
                page.mediaBox.upperRight = new_upper_right

            # Bring the remaining boxes in line with the trimmed media box.
            page.artBox = page.mediaBox
            page.bleedBox = page.mediaBox
            page.cropBox = page.mediaBox

            writer.addPage(page)

        print("Writing file ...")
        with open(output_file, 'wb') as target:
            writer.write(target)
Beispiel #18
0
def rotate_pdf(original_filename, new_filename, rotation):
    """Rotate every page of a PDF clockwise and save the result.

    Args:
        original_filename: PDF to read.
        new_filename: Path of the rotated PDF to write.
        rotation: Angle handed to ``rotateClockwise`` for each page.
    """
    with open(original_filename, 'rb') as source:
        reader = PdfFileReader(source)
        rotated = PdfFileWriter()

        for index in range(reader.numPages):
            current = reader.getPage(index)
            current.rotateClockwise(rotation)
            rotated.addPage(current)

        # Written inside the outer ``with`` so the source is still open.
        with open(new_filename, 'wb') as target:
            rotated.write(target)
Beispiel #19
0
def watermark(input, watermark, output, verbosity, **kwargs):
    """use first page of pdf and add it as watermark to other document

es. pdfcli watermark wm.pdf source.pdf -o final.pdf

"""
    # NOTE(review): the docstring is left untouched in case a CLI framework
    # renders it as help text - confirm before rewording it.
    # ``verbosity`` and ``kwargs`` are accepted but not used here.
    overlay_reader = PdfFileReader(watermark)
    overlay = overlay_reader.getPage(0)

    document = PdfFileReader(input)
    result = PdfFileWriter()
    for number in range(document.getNumPages()):
        merged = document.getPage(number)
        merged.mergePage(overlay)
        result.addPage(merged)

    # ``output`` is handed to the writer as is, i.e. it has to be a
    # writable stream rather than a path.
    result.write(output)
Beispiel #20
0
def paginate_pdf(pdf_name, number_page, pagination_template):
    """Create a rotated copy of a manual with a pagination footer per page.

    For each of the first ``number_page`` pages, a blank page with swapped
    width and height is created, the pagination page with the same index is
    merged in, and the manual page is merged on top, rotated by -90 degrees.
    The result is written to ``./paginatedPDFs/<pdf_name>``.

    Args:
        pdf_name: Path of the manual; also used as the output file name.
        number_page: Number of pages to process, starting at page index 0.
        pagination_template: PDF providing one pagination page per manual
            page.
    """
    writer = PdfFileWriter()
    pdf_out = "./paginatedPDFs/" + pdf_name  # New name of the output pdf file.

    # Context managers close both inputs even when a page is missing or the
    # write fails.
    with open(pdf_name, "rb") as stream_manuals, \
            open(pagination_template, "rb") as stream_pagination:
        manuals = PdfFileReader(stream_manuals)
        pagination = PdfFileReader(stream_pagination)

        for i in range(number_page):
            manuals_page = manuals.getPage(i)
            pagination_page = pagination.getPage(i)

            # Blank page with the manual page's width and height swapped.
            translated_page = PageObject.createBlankPage(
                None,
                width=manuals_page.mediaBox.getHeight(),
                height=manuals_page.mediaBox.getWidth(),
            )

            # Stack pagination
            translated_page.mergePage(pagination_page)

            # Stack manual.
            # NOTE(review): ty=792 is hard-coded; it presumably matches the
            # size of the manuals this was written for - confirm.
            translated_page.mergeRotatedScaledTranslatedPage(manuals_page,
                                                             rotation=-90,
                                                             scale=1,
                                                             tx=0,
                                                             ty=792,
                                                             expand=True)

            writer.addPage(translated_page)

        # Write while the inputs are still open.
        with open(pdf_out, "wb") as out_stream:
            writer.write(out_stream)

    print(f"{pdf_out} copied to paginatedPDFs\n")
Beispiel #21
0
 def merge_doc(self):
     """Merge each cover with its document and write the combined file.

     ``self.doc_code``, ``self.name_list`` and ``self.final_names`` are
     zipped into ``self.file_lists``. For every triple, the pages of the
     first two files are concatenated and written to the third name.
     """
     self.file_lists = list(
         zip(self.doc_code, self.name_list, self.final_names))
     for pdfnames in self.file_lists:
         output = PdfFileWriter()
         handles = []
         try:
             for pdfname in pdfnames[0:2]:
                 handle = open(pdfname, "rb")
                 handles.append(handle)
                 reader = PdfFileReader(handle, strict=False)
                 for iPage in range(reader.getNumPages()):
                     output.addPage(reader.getPage(iPage))
             pdfoutname = str(pdfnames[2])
             with open(pdfoutname, "wb") as outputStream:
                 output.write(outputStream)
         finally:
             # The inputs must stay open until the write is done; close them
             # afterwards instead of leaking one handle per input file.
             for handle in handles:
                 handle.close()
         print("文件合并完成:", pdfoutname)
     print("文件合并完成!")
     print("=><=" * 25)
Beispiel #22
0
    def mergePdf(inFileList, outFile):
        '''
        Merge several PDF documents into one.

        :param inFileList: list of the documents to merge, in output order
        :param outFile:    path of the merged output file
        :return: None
        '''
        pdfFileWriter = PdfFileWriter()
        handles = []
        try:
            for inFile in inFileList:
                # Open the files to merge one after another
                handle = open(inFile, 'rb')
                handles.append(handle)
                pdfReader = PdfFileReader(handle)
                for index in range(pdfReader.getNumPages()):
                    pdfFileWriter.addPage(pdfReader.getPage(index))

            # Finally, write everything once, after all inputs have been
            # collected (this used to run once per input file).
            with open(outFile, 'wb') as outStream:
                pdfFileWriter.write(outStream)
        finally:
            # Inputs stay open until the write is done, then are released.
            for handle in handles:
                handle.close()
def put_watermark(input_pdf, output_pdf, watermark):
    """Apply the first page of ``watermark`` to every page of ``input_pdf``.

    Args:
        input_pdf: PDF on which the watermark is placed.
        output_pdf: Path of the watermarked PDF to write.
        watermark: PDF whose first page serves as the watermark.
    """
    mark_reader = PdfFileReader(watermark)
    mark = mark_reader.getPage(0)

    original = PdfFileReader(input_pdf)
    marked = PdfFileWriter()

    page_total = original.getNumPages()
    for position in range(page_total):
        sheet = original.getPage(position)
        sheet.mergePage(mark)
        marked.addPage(sheet)

    with open(output_pdf, 'wb') as destination:
        marked.write(destination)
def merge_pdfs(paths, output):
    """Concatenate PDFs into one file, optionally scaling 'A4' entries.

    Args:
        paths: Sequence of entries where item 0 is the PDF to read and
            item 1 is its size label.
        output: Path of the merged PDF to write.
    """
    combined = PdfFileWriter()

    for entry in paths:
        print(f'path {entry[1]} {entry[0]}')
        reader = PdfFileReader(entry[0])
        for number in range(reader.getNumPages()):
            current = reader.getPage(number)
            # Nothing is printed or scaled unless USE_SCALING is set.
            if USE_SCALING and entry[1] == 'A4':
                print('scaling...')
                current.scaleTo(812, 595)
            elif USE_SCALING:
                print('merge as is...')
            combined.addPage(current)

    with open(output, 'wb') as target:
        combined.write(target)
Beispiel #25
0
def parse(pdf_file):
    """Copy every page whose text matches "微信" into a new PDF.

    The pages are scanned with pdfminer; the zero-based indices of matching
    pages are collected and those pages are written to a fixed path on the
    desktop.

    Args:
        pdf_file: Path of the PDF to scan.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not permit text
            extraction.
    """
    pattern = re.compile("微信")

    # ``with`` closes the file on every path, including the raise below.
    with open(pdf_file, 'rb') as fp:
        # Parser plus document object holding the document structure
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Check whether the file allows text extraction
        if not document.is_extractable:
            print('nono')
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregating device and
        # interpreter
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pageindex = []
        for i, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            # Layout object of this page
            layout = device.get_result()
            # Record a page once, however many of its text elements match,
            # so it is not duplicated in the output.
            if any(isinstance(x, LTText) and pattern.search(x.get_text())
                   for x in layout):
                pageindex.append(i)

        pdf_output = PdfFileWriter()
        pdf_input = PdfFileReader(fp)
        for j in pageindex:
            pdf_output.addPage(pdf_input.getPage(j))

        final_path = os.path.join(r"C:\Users\big\Desktop\final.pdf")
        with open(final_path, "wb") as f:
            pdf_output.write(f)
Beispiel #26
0
    def merge_pdf(self, file_list, outpdf):
        """Merge several PDF files into one.

        Args:
            file_list: The PDFs to merge, in output order.
            outpdf: Name of the merged PDF without directory, e.g.
                ``merge_res.pdf``; it is appended to ``self.processed``.

        Returns:
            A fixed completion message.
        """
        merged = PdfFileWriter()

        for source in file_list:
            reader = PdfFileReader(source)
            page_total = reader.getNumPages()
            # Append every page of this file to the writer.
            for number in range(page_total):
                merged.addPage(reader.getPage(number))

        destination = self.processed + outpdf
        with open(destination, "wb") as handle:
            merged.write(handle)

        return "合并完成!"
Beispiel #27
0
def merge_pdfs(paths, output):
    """Combine individually downloaded dashboard files into one report.

    Entries labelled 'A4' are scaled when USE_SCALING is set.

    Args:
        paths: Sequence of entries where item 0 is the PDF to read and
            item 1 is its size label.
        output: Path of the compiled report to write.
    """
    report = PdfFileWriter()

    for entry in paths:
        file_log.debug(f'path {entry[1]} {entry[0]}')
        source = PdfFileReader(entry[0])
        for position in range(source.getNumPages()):
            sheet = source.getPage(position)
            if USE_SCALING:
                needs_scaling = entry[1] == 'A4'
                if needs_scaling:
                    file_log.debug('scaling...')
                    sheet.scaleTo(812, 595)
                else:
                    file_log.debug('merge as is...')
            report.addPage(sheet)

    with open(output, 'wb') as target:
        report.write(target)
Beispiel #28
0
def MergePDF(filepath):
    """Merge groups of PDFs from the input folder into the output folder.

    ``getFileName`` is called with the input folder and yields groups of
    PDF paths. Each group is concatenated and written to the path of its
    first member with "input" replaced by "output". A message box is shown
    once all groups are done; on any error the error is printed and the
    program exits.

    Args:
        filepath: Base directory containing the ``input`` folder.
    """
    try:
        in_file_path = filepath + r"\input\\"
        pdf_fileName = getFileName(in_file_path)
        for pdfnames in pdf_fileName:
            output = PdfFileWriter()
            handles = []
            try:
                for pdfname in pdfnames:
                    handle = open(pdfname, "rb")
                    handles.append(handle)
                    reader = PdfFileReader(handle)
                    for iPage in range(reader.getNumPages()):
                        output.addPage(reader.getPage(iPage))
                pdfoutname = str(pdfnames[0]).replace("input", "output")
                with open(pdfoutname, "wb") as outputStream:
                    output.write(outputStream)
            finally:
                # Inputs stay open until the group is written, then are
                # closed instead of leaking one handle per file.
                for handle in handles:
                    handle.close()
        messagebox.showinfo("Complete!", "Complete!")
    except Exception as err:
        # Top-level boundary of the tool: report and terminate, as before.
        print("Something went wrong")
        print(err)
        sys.exit()
Beispiel #29
0
    def create_watermark(input_pdf, output, watermark):
        """
        Add a watermark to every page of a PDF.

        :param input_pdf: path of the PDF to watermark
        :param output: path under which the watermarked PDF is saved
        :param watermark: PDF containing the watermark image or text;
            its first page is used
        :return: None
        """
        mark_document = PdfFileReader(watermark)
        mark_page = mark_document.getPage(0)

        original = PdfFileReader(input_pdf)
        result = PdfFileWriter()

        # Every page of the input receives the watermark.
        for position in range(original.getNumPages()):
            sheet = original.getPage(position)
            sheet.mergePage(mark_page)
            result.addPage(sheet)

        with open(output, 'wb') as destination:
            result.write(destination)
Beispiel #30
0
def pdf_to_pdfs(pdf_folder, page_count):
    """
    Combine single-page PDF files into one document.

    The files ``0.pdf`` .. ``<page_count - 1>.pdf`` in ``pdf_folder`` are
    read in order; the first page of each is appended and the result is
    saved as ``combine.pdf`` in the same folder.

    :param pdf_folder: folder holding the single-page PDFs and the result
    :param page_count: number of single-page PDFs to combine
    :return: None
    """
    # Start with an empty PDF document
    pdf_writer = PdfFileWriter()
    pdf_file_path = '{}/{}.pdf'.format(pdf_folder, 'combine')

    for page_index in range(page_count):
        page_pdf_file = '{}/{}.pdf'.format(pdf_folder, page_index)
        # Read the single-page PDF and take its only page
        reader = PdfFileReader(page_pdf_file, strict=False)
        pdf_writer.addPage(reader.getPage(0))

    # Save; ``with`` closes the output so the data is flushed.
    with open(pdf_file_path, 'wb') as combined:
        pdf_writer.write(combined)