Example #1
0
    def extract_information(self, pdf_path, link):
        """Read the document metadata of a PDF file.

        Args:
            pdf_path: Path of the PDF file to inspect.
            link: Download link, stored unchanged under ``"download_link"``.

        Returns:
            A ``(pdf_path, info)`` tuple. ``info`` maps ``author``,
            ``creator``, ``producer``, ``subject``, ``title``,
            ``creation_date``, ``modification_date``, ``number_of_pages`` and
            ``download_link`` to their values. Metadata that is missing or
            cannot be parsed is reported as ``None``.
        """
        with open(pdf_path, 'rb') as f:
            pdf = PdfFileReader(f)
            # May be None when the PDF carries no document-information entry.
            information = pdf.getDocumentInfo()
            number_of_pages = pdf.getNumPages()

            def readable_date(key):
                # Best effort: a missing or malformed date becomes None
                # without discarding the other date.
                try:
                    return extract_date(information.getText(key).split('-'))
                except Exception:
                    return None

            # NOTE(review): the metadata is read while the file is still open;
            # the reader may resolve some values lazily from the stream.
            info = {
                "author": getattr(information, "author", None),
                "creator": getattr(information, "creator", None),
                "producer": getattr(information, "producer", None),
                "subject": getattr(information, "subject", None),
                "title": getattr(information, "title", None),
                "creation_date": readable_date("/CreationDate"),
                "modification_date": readable_date("/ModDate"),
                "number_of_pages": number_of_pages,
                "download_link": link
            }

        return pdf_path, info
Example #2
0
    def merge(self):
        """Merge every PDF listed in the tree view into one user-chosen file.

        Asks for the destination path first and aborts with an error dialog
        when none is given. Encrypted inputs that cannot be decrypted are
        reported and left out of the result.
        """
        destination = save_as_pdf()
        if not destination:
            messagebox.showerror(MESSAGE_TITLE, "You must specify a file save path.")
            return

        # Make sure the destination carries a ".pdf" extension.
        if not destination.lower().endswith(".pdf"):
            destination = destination + ".pdf"

        merged = PdfFileWriter()

        for row in self.tree.get_children():
            values = self.tree.item(row, option="values")
            label = values[0]
            reader = PdfFileReader(values[1])

            if reader.isEncrypted and not decrypt(reader, MESSAGE_TITLE):
                messagebox.showwarning(
                    MESSAGE_TITLE,
                    f"{label} could not be decrypted. It will not be included in the merge.")
                continue

            for index in range(reader.getNumPages()):
                merged.addPage(reader.getPage(index))

        with Path(destination).open(mode="wb") as target:
            merged.write(target)

        messagebox.showinfo(MESSAGE_TITLE, "PDF Merged")
Example #3
0
def get_info(input_file: str):
    """Collect and print basic information about a PDF file.

    Args:
        input_file: Path of the PDF file to inspect.

    Returns:
        A ``(True, output)`` tuple where ``output`` is a dict holding the file
        name, the encryption flag (as the string "True" or "False") and, for
        unencrypted files, the document metadata and the page count.
    """
    with open(input_file, 'rb') as stream:
        reader = PdfFileReader(stream, strict=False)
        encrypted = reader.isEncrypted
        output = {
            "File": input_file,
            "Encrypted": "True" if encrypted else "False",
        }
        # The metadata of an encrypted PDF cannot be extracted.
        if not encrypted:
            info = reader.getDocumentInfo()
            num_pages = reader.getNumPages()
            output.update({
                "Author": info.author,
                "Creator": info.creator,
                "Producer": info.producer,
                "Subject": info.subject,
                "Title": info.title,
                "Number of pages": num_pages,
            })

    # Display the collected information between two banner lines.
    report = "\n".join(f"{key}:{value}" for key, value in output.items())
    print(
        "## File Information ##################################################"
    )
    print(report)
    print(
        "######################################################################"
    )
    return True, output
Example #4
0
    def test_merge_pdf_output(self):
        # Merges a mix of image and PDF samples and checks the page count of
        # the generated document.
        merged_path = 'test_merged_pdf.pdf'
        sample_paths = [
            'tests/pdf_samples/jpeg_w_350.jpg',
            'tests/pdf_samples/pdf_sample_A Sample PDF_loremIpsum_pages_01.pdf',
            'tests/pdf_samples/pdf_sample_b_pages_01.pdf',
            'tests/pdf_samples/pdf_sample_dummy_w3c_pages_01.pdf',
            'tests/pdf_samples/pdf_sample_googledocs_image_pages_02.pdf',
            # The next PDF fails to read:
            # invalid literal for int() with base 10: b'F-1.4'
            # 'tests/pdf_samples/pdf_sample_googlesheet_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_libreoffice_exported_ISO19005_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_libreoffice_exported_format_FDF_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_libreoffice_exported_hibrid_format_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_libreoffice_exported_not_hybrid_ISO19005_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_pages_01.pdf',
            # A (path, tuple) entry instead of a plain path.
            ('tests/pdf_samples/pdf_sample_readthedocs_pdf_networkdays_pages_019.pdf', (0, 2)),
            'tests/pdf_samples/pdf_sample_text_edit_macos_pages_01.pdf',
            'tests/pdf_samples/pdf_sample_wikimedia_org_pages_01.pdf',
            'tests/pdf_samples/sample_pdf_commandline_xhtml2pdf_generated_pages_01.pdf',
            'tests/pdf_samples/issue_repo_pypdf4.pdf',
            'tests/pdf_samples/issue_repo_pypdf4_test.pdf',
        ]

        merger = MergeToPdf(paths_list=sample_paths, output_file_path=merged_path)
        merger.merge_pdfs()

        with open(merged_path, "rb") as merged_file:
            page_count = PdfFileReader(merged_file).getNumPages()

            self.assertEqual(page_count, 23)
Example #5
0
def put_watermark(input_pdf, output_pdf, watermark):
    """Merge the first page of a watermark PDF into every page of a document.

    Args:
        input_pdf: The PDF to watermark, passed to ``PdfFileReader``.
        output_pdf: Path of the file the watermarked document is written to.
        watermark: The watermark PDF, passed to ``PdfFileReader``; only its
            first page is used.
    """
    # Only the first page of the watermark document serves as the stamp.
    stamp = PdfFileReader(watermark).getPage(0)

    source = PdfFileReader(input_pdf)
    result = PdfFileWriter()

    for index in range(source.getNumPages()):
        current = source.getPage(index)
        # Merge the stamp into the page, then queue the page for output.
        current.mergePage(stamp)
        result.addPage(current)

    with open(output_pdf, 'wb') as target:
        result.write(target)
Example #6
0
def main():
    """Command-line entry point of ``ca6fix``.

    Copies every page of the input PDF, inserts a blank page at index 4, adds
    the outline entries from ``outline`` via ``add_outline``, sets the page
    layout and the title/author metadata, and writes the result to the output
    path.
    """
    cli = argparse.ArgumentParser(
        prog='ca6fix',
        description="Fix some disappointmented points in Computer Architecture Quantitative Approach 6th Edition Japanese translation PDF file.",
        usage='ca6fix -i ca6.pdf -o ca6_fixed.pdf',
        add_help=True)
    cli.add_argument('-i', '--input', help='input PDF file', required=True)
    cli.add_argument('-o', '--output', help='output PDF file', required=True)
    options = cli.parse_args()

    source = PdfFileReader(options.input)
    result = PdfFileWriter()
    for number in range(source.getNumPages()):
        result.addPage(source.getPage(number))

    result.insertBlankPage(None, None, 4)

    for entry in outline:
        add_outline(result, entry, 21)

    result.setPageLayout('/TwoPageRight')

    metadata = {
        '/Title': 'コンピュータアーキテクチャ 定量的アプローチ[第6版]',
        '/Author': 'ジョン・L・ヘネシー, デイビッド・A・パターソン(著), 中條拓伯, 天野英晴, 鈴木 貢(訳)',
    }
    result.addMetadata(metadata)

    with open(options.output, 'wb') as target:
        result.write(target)
Example #7
0
def add_watermark(file_path, file_stage, fileno):
    """Add a generated watermark to every page of a PDF, rewriting it in place.

    Encrypted inputs are left untouched. The result is encrypted with an empty
    user password and a fixed owner password, and the original document
    information is copied over when present.

    Args:
        file_path: Path of the PDF to watermark; it is overwritten.
        file_stage: Passed through to ``create_watermark``.
        fileno: Passed through to ``create_watermark``.
    """
    pdf_input = PdfFileReader(file_path)
    if pdf_input.isEncrypted:
        return
    pdf_info = pdf_input.getDocumentInfo()
    w, h = pdf_input.getPage(0).mediaBox[2:]
    # Convert the page size to millimetres (factor 0.3528 per unit).
    page_size_mm = (int(w) * 0.3528, int(h) * 0.3528)

    # Create the watermark file sized for the first page.
    mark = create_watermark(page_size_mm, file_stage, fileno)

    pdf_output = PdfFileWriter()
    # The watermark file stays open until the output is written, since the
    # merged pages still reference its content.
    with open(mark, 'rb') as mark_file:
        watermark_page = PdfFileReader(mark_file, strict=False).getPage(0)
        for i in range(pdf_input.getNumPages()):
            page = pdf_input.getPage(i)
            page.mergePage(watermark_page)
            pdf_output.addPage(page)

        # NOTE(review): the owner password is hard-coded.
        pdf_output.encrypt(user_pwd='', owner_pwd='12345', use_128bit=True)
        # The source may have no document information at all.
        if pdf_info is not None:
            pdf_output.addMetadata(pdf_info)

        with open(file_path, 'wb') as out_file:
            pdf_output.write(out_file)
def remove_watermark(wm_text, inputFile, outputFile):
    """Blank out watermark text in a PDF and write the result to a new file.

    Every ``Tj`` (show text) operation whose string operand starts with
    ``wm_text`` has that operand replaced by an empty string.

    Args:
        wm_text: Prefix of the watermark text to remove.
        inputFile: Path of the PDF to read.
        outputFile: Path of the PDF to write.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    with open(inputFile, "rb") as f:
        # The original passed "rb" as the second argument, which is the
        # ``strict`` flag; keep the resulting strict mode but spell it out.
        source = PdfFileReader(f, strict=True)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # Pages without a content stream have nothing to scrub.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                for operands, operator in content.operations:
                    if operator == b_("Tj"):
                        text = operands[0]

                        if isinstance(text, str) and text.startswith(wm_text):
                            operands[0] = TextStringObject('')

                page[NameObject('/Contents')] = content

            output.addPage(page)

        # Written while the input is still open: the pages read from it.
        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)
    def _merge_documents_PyPDF4(self, file_name, paths):
        """Merge the given PDF files into ``settings.SAVE_PATH / file_name``.

        Falsy entries in ``paths`` are skipped. A ``PdfReadError`` is logged
        instead of raised.

        Returns:
            The output path, both on success and after a logged read error.
        """
        output = settings.SAVE_PATH / file_name
        try:
            writer = PdfFileWriter()

            for source in filter(None, paths):
                reader = PdfFileReader(str(source), strict=False)
                for index in range(reader.getNumPages()):
                    writer.addPage(reader.getPage(index))

            # Write out the merged document.
            with open(output, 'wb') as target:
                writer.write(target)
        except utils.PdfReadError as error:
            LogHandler.execution_log(error=error)
            LogHandler.execution_log(
                error=f'ERROR ON: {output.name.replace(".PDF", "")}')

        return output
Example #10
0
def unwatermark_pdf(input_file: str, wm_text: str, pages: Tuple = None):
    """Build a writer holding the pages of a PDF with watermark text blanked.

    Args:
        input_file: Path of the PDF to read.
        wm_text: Prefix of the watermark text to remove.
        pages: Optional collection of page indices given as strings; when
            provided, only those pages are processed and kept.

    Returns:
        A ``(True, pdf_reader, pdf_writer)`` tuple. The input file is not
        closed here.
    """
    pdf_reader = PdfFileReader(open(input_file, 'rb'), strict=False)
    pdf_writer = PdfFileWriter()

    for index in range(pdf_reader.getNumPages()):
        # Restrict to the requested pages, if any were given.
        if pages and str(index) not in pages:
            continue

        current = pdf_reader.getPage(index)
        stream = ContentStream(current["/Contents"].getObject(), pdf_reader)

        # Replace the string operand of matching "Tj" operations with ''.
        for operands, operator in stream.operations:
            if operator != b_("Tj"):
                continue
            shown = operands[0]
            if isinstance(shown, str) and shown.startswith(wm_text):
                operands[0] = TextStringObject('')

        current[NameObject('/Contents')] = stream
        pdf_writer.addPage(current)

    return True, pdf_reader, pdf_writer
Example #11
0
    def encrypt_file(self):
        """Encrypt the selected PDF with the entered password and save a copy.

        Shows an error dialog and stops when no file is selected, no password
        is entered or no save path is chosen. Shows a warning and stops when
        the file is already encrypted.
        """
        path = self.file_selector.getpath()
        if not path:
            messagebox.showerror(MESSAGE_TITLE, "You must select a PDF file.")
            return

        if not self.password.get():
            messagebox.showerror(MESSAGE_TITLE, "You must enter a password.")
            return

        pdf_reader = PdfFileReader(path)
        if pdf_reader.isEncrypted:
            messagebox.showwarning(MESSAGE_TITLE, "File is already encrypted.")
            return

        pdf_writer = PdfFileWriter()

        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

        pdf_writer.encrypt(self.password.get())

        save_path = save_as_pdf(parent=self)
        if not save_path:
            messagebox.showerror(MESSAGE_TITLE,
                                 "You must specify a file save path")
            # Stop here: without a path there is nothing to write to.
            return

        if save_path[-4:].lower() != ".pdf":
            save_path += ".pdf"

        with Path(save_path).open(mode="wb") as save_file:
            pdf_writer.write(save_file)

        messagebox.showinfo(MESSAGE_TITLE, "PDF encrypted.")
Example #12
0
def make_booklet(input_name, output_name, blanks=0):
    """Write a booklet PDF built from the pages of ``input_name``.

    Each output page is twice as wide as the first input page. All sheet backs
    are written first, followed by all sheet fronts.

    Args:
        input_name: Path of the PDF to read.
        output_name: Path of the booklet PDF to write.
        blanks: Number of ``None`` placeholders put before the first page.
    """
    # Both files are closed on exit; the input stays open until the output is
    # written because the pages are read from it.
    with open(input_name, "rb") as source:
        reader = PdfFileReader(source)
        pages = [None] * blanks
        pages.extend(reader.getPage(p) for p in range(reader.getNumPages()))

        sheets = build_booklet(pages)

        writer = PdfFileWriter()
        first_page = reader.getPage(0)
        output_width = first_page.mediaBox.getWidth() * 2
        output_height = first_page.mediaBox.getHeight()
        page_size = (output_width, output_height)

        # We want to group fronts and backs together.
        for sheet in sheets:
            add_double_page(writer, page_size, sheet.back)

        for sheet in sheets:
            add_double_page(writer, page_size, sheet.front)

        with open(output_name, "wb") as target:
            writer.write(target)

    print_instructions(sheets)
Example #13
0
def rotate(input, output, pages, verbosity, rotate, **kwargs):
    """rotate selected pages

Rotate selected pages and outputs in new pdf
"""
    # input: source PDF handed to PdfFileReader.
    # output: writable binary stream; its ``name`` is echoed when verbose.
    # pages: 1-based page numbers to rotate, or None to rotate every page.
    # rotate: one of 'left', 'right' or 'inverted'.
    source = PdfFileReader(input)

    angle = {'left': -90, 'right': 90, 'inverted': 180}[rotate]
    page_count = source.getNumPages()
    # Page numbers are 1-based, so the upper bound must include page_count;
    # otherwise the last page is never copied to the output.
    if pages is None:
        pages = range(1, page_count + 1)

    output_pdf = PdfFileWriter()
    for page_num in range(1, page_count + 1):
        if verbosity >= 1:
            click.echo(".", nl=False)
        if verbosity >= 2:
            click.echo("Extracting page %s" % page_num)
        page = source.getPage(page_num - 1)
        if page_num in pages:
            # Negative angles rotate counter-clockwise.
            page.rotateClockwise(angle)
        output_pdf.addPage(page)

    if verbosity >= 1:
        click.echo("Writing %s" % output.name)
    output_pdf.write(output)
Example #14
0
def ohin(obj_pages: List[int], save_path: str, input_path: str, img_path: str, position: List[float], img_size: List[float], pdf_size: str = "A4") -> None:
    """Overlay an image on selected pages of a PDF and save the result.

    A temporary ``overlay.pdf`` is produced by ``overlay_pdf_maker`` and
    removed again afterwards, even when an error occurs.

    Args:
        obj_pages: 1-based numbers of the pages that receive the overlay. An
            empty list produces a copy without any overlay.
        save_path: Path of the PDF to write.
        input_path: Path of the PDF to read.
        img_path: Passed through to ``overlay_pdf_maker``.
        position: Passed through to ``overlay_pdf_maker``.
        img_size: Passed through to ``overlay_pdf_maker``.
        pdf_size: Passed through to ``overlay_pdf_maker``.
    """
    overlay_path = "overlay.pdf"
    overlay_pdf_maker(overlay_path, img_path, position, img_size, pdf_size)

    try:
        with open(overlay_path, 'rb') as f_overlay, open(input_path, 'rb') as f_target:
            overlay = PdfFileReader(f_overlay).getPage(0)
            reader = PdfFileReader(f_target)
            num_pages = reader.getNumPages()

            # max() of an empty list raises, so only check when pages are given.
            assert not obj_pages or num_pages >= max(obj_pages), \
                "obj_pages refers to a page beyond the end of the document"

            for p in obj_pages:
                reader.getPage(p - 1).mergePage(overlay)

            writer = PdfFileWriter()
            for p in range(num_pages):
                writer.addPage(reader.getPage(p))

            # Written while both inputs are still open.
            with open(save_path, 'wb') as f:
                writer.write(f)
    finally:
        # Never leave the temporary overlay behind.
        os.remove(overlay_path)
Example #15
0
    def splitPdf(path='./input.pdf', N=5):
        """Split a PDF into two files: pages 1..N and pages N+1..last.

        The outputs are written next to the input as ``<stem>p1_<N>.pdf`` and
        ``<stem>p<N+1>_<total>.pdf``. Nothing is written when ``path`` is not
        a file or the document has no more than ``N`` pages.

        Args:
            path: Path of the PDF to split.
            N: Number of pages that go into the first output file.
        """
        if not os.path.isfile(path):
            return

        pdfFileReader = PdfFileReader(path)

        numPages = pdfFileReader.getNumPages()  # total number of pages
        print(numPages)

        fname = os.path.splitext(path)[0]
        outFile1 = '{}p1_{}.pdf'.format(fname, N)
        outFile2 = '{}p{}_{}.pdf'.format(fname, N + 1, numPages)
        print(outFile1, outFile2)

        if numPages > N:
            parts = (
                (outFile1, range(N)),            # pages 1..N
                (outFile2, range(N, numPages)),  # pages N+1..last
            )
            for outFile, indices in parts:
                # A fresh writer per part, so the second file does not also
                # contain the pages of the first one.
                pdfFileWriter = PdfFileWriter()
                for index in indices:
                    pdfFileWriter.addPage(pdfFileReader.getPage(index))
                with open(outFile, 'wb') as out:
                    pdfFileWriter.write(out)
Example #16
0
    def readPdf(readFile='./input.pdf'):
        """Print document-level information and per-page details of a PDF.

        Args:
            readFile: Path of the PDF to inspect.
        """
        reader = PdfFileReader(readFile)
        # Alternative: reader = PdfFileReader(open(readFile, 'rb'))

        # Document information.
        info = reader.getDocumentInfo()
        print('documentInfo = %s' % info)

        # Page layout.
        layout = reader.getPageLayout()
        print('pageLayout = %s ' % layout)

        # Page mode.
        mode = reader.getPageMode()
        print('pageMode = %s' % mode)

        # XMP metadata.
        xmp = reader.getXmpMetadata()
        print('xmpMetadata  = %s ' % xmp)

        # Number of pages.
        total = reader.getNumPages()
        print('pageCount = %s' % total)

        for position in range(total):
            # Page object for the given index.
            page_object = reader.getPage(position)
            print('index = %d , pageObj = %s' % (position, type(page_object)))
            # Page number of the page object as reported by the reader.
            number = reader.getPageNumber(page_object)
            print('pageNumber = %s ' % number)
Example #17
0
def rotate_pdf(path, degrees, output):
    """Rotate every page of a PDF clockwise and write the result.

    Args:
        path: The PDF to read, passed to ``PdfFileReader``.
        degrees: Rotation angle; converted with ``int()`` for each page.
        output: Passed directly to ``PdfFileWriter.write``.
    """
    result = PdfFileWriter()
    source = PdfFileReader(path)

    for index in range(source.getNumPages()):
        rotated = source.getPage(index).rotateClockwise(int(degrees))
        result.addPage(rotated)

    result.write(output)
Example #18
0
def merge_pdfs(pdf1_path, pdf2_path):
    """Concatenate two PDFs into ``Resultant_PDF_After_Merging.pdf``.

    Args:
        pdf1_path: Path of the PDF whose pages come first.
        pdf2_path: Path of the PDF whose pages follow.
    """
    pdf_writer = PdfFileWriter()

    for pdf_path in (pdf1_path, pdf2_path):
        # The original passed 'rb' as the second argument, which is the
        # ``strict`` flag; keep the resulting strict mode but spell it out.
        pdf_reader = PdfFileReader(pdf_path, strict=True)
        for i in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(i))

    with open('Resultant_PDF_After_Merging.pdf', 'wb') as fh:
        pdf_writer.write(fh)

    # Report the actual parameters; the original referenced undefined names.
    print(f"Merged the pdfs '{pdf1_path}', and '{pdf2_path}'.")
Example #19
0
def join_pdfs(paths, output):
    """Concatenate the pages of several PDFs and write them once.

    Args:
        paths: Iterable of PDFs to read, in output order.
        output: Passed directly to ``PdfFileWriter.write``.
    """
    pdf_writer = PdfFileWriter()

    for path in paths:
        pdf_reader = PdfFileReader(path)
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

    # Write once, after every input has been added; writing inside the loop
    # emitted the growing document again after each input.
    pdf_writer.write(output)
Example #20
0
 def on_file_selected(self):
     """Show the page count of the selected PDF.

     Does nothing when no path is selected. When the file is encrypted and
     cannot be decrypted, the page count text and the selection are cleared.
     """
     if not self.file_selector.getpath():
         return

     reader = PdfFileReader(self.file_selector.getpath())
     locked = reader.isEncrypted and not decrypt(reader, MESSAGE_TITLE)
     if locked:
         self.page_count_text.set("")
         self.file_selector.clear()
         return

     self.page_count_text.set(reader.getNumPages())
Example #21
0
 def extract(fileobj):
     """Extract the text of every page of a PDF.

     Returns one concatenated string when the enclosing ``fmt`` equals
     "string", otherwise a list with one entry per page.
     """
     reader = PdfFileReader(fileobj, strict=False)
     as_string = fmt == "string"
     per_page = [reader.getPage(number).extractText()
                 for number in range(reader.getNumPages())]
     return "".join(per_page) if as_string else per_page
def extract_text(pdf_path):
    """Write the extracted text of every page to ``Text_Output.txt``.

    Args:
        pdf_path: Path of the PDF to read.
    """
    # The original passed 'rb' as the second argument, which is the
    # ``strict`` flag; keep the resulting strict mode but spell it out.
    pdf_reader = PdfFileReader(pdf_path, strict=True)

    # The context manager closes and flushes the output file. UTF-8 avoids
    # encoding errors for text outside the platform's default code page.
    with open('Text_Output.txt', 'w', encoding='utf-8') as f:
        for i in range(pdf_reader.getNumPages()):
            f.write(pdf_reader.getPage(i).extractText())
def extract_pdf_text(path, format="string"):
    """Extract the text of every page of the PDF at ``path``.

    Args:
        path: Path of the PDF to read.
        format: "string" for one concatenated string; any other value yields
            a list with one entry per page.

    Returns:
        The extracted text as a string or as a list of page texts.
    """
    with open(path, "rb") as stream:
        reader = PdfFileReader(stream)
        as_string = format == "string"
        per_page = [reader.getPage(number).extractText()
                    for number in range(reader.getNumPages())]
        return "".join(per_page) if as_string else per_page
Example #24
0
def pdf_appendfile(inpath: str, appendpath: str, page_no: int, outpath: str):
    """Insert all pages of one PDF after a given page of another.

    Args:
        inpath: Path of the base PDF.
        appendpath: Path of the PDF whose pages are inserted.
        page_no: 1-based number of the base page after which the pages are
            inserted. If no base page has this number, nothing is inserted.
        outpath: Path of the PDF to write. When empty, ``<stem>_output.pdf``
            is used, where ``<stem>`` is the base file name without extension.
    """
    if not outpath:
        fname = os.path.splitext(os.path.basename(inpath))[0]
        outpath = '{}_output.pdf'.format(fname)

    pdf = PdfFileReader(inpath)
    pdf_a = PdfFileReader(appendpath)
    pdf_writer = PdfFileWriter()
    insert_after = int(page_no)

    for page in range(pdf.getNumPages()):
        pdf_writer.addPage(pdf.getPage(page))
        if page + 1 == insert_after:
            for page_a in range(pdf_a.getNumPages()):
                pdf_writer.addPage(pdf_a.getPage(page_a))

    # Open the output only once the document is assembled, so a failure while
    # reading the inputs does not leave an empty file behind.
    with open(outpath, 'wb') as out:
        pdf_writer.write(out)

    print('Created: {}'.format(outpath))
Example #25
0
def extractPdfPage(path, pageIndex, extractedPdfName):
    """Write a single page of a PDF to its own file.

    Nothing is written when no page index equals ``pageIndex``.

    Args:
        path: The PDF to read, passed to ``PdfFileReader``.
        pageIndex: Zero-based index of the page to extract.
        extractedPdfName: Path of the one-page PDF to write.
    """
    source = PdfFileReader(path)
    for index in range(source.getNumPages()):
        if index == pageIndex:
            single = PdfFileWriter()
            single.addPage(source.getPage(index))

            with open(extractedPdfName, 'wb') as target:
                single.write(target)
Example #26
0
def mergePdfFiles(paths, outputPath):
    """Concatenate several PDFs into one file.

    Args:
        paths: Iterable of PDFs to read, in output order.
        outputPath: Path of the merged PDF to write.
    """
    merged = PdfFileWriter()

    # A reader is created lazily for each input, in order.
    for reader in map(PdfFileReader, paths):
        for index in range(reader.getNumPages()):
            merged.addPage(reader.getPage(index))

    with open(outputPath, 'wb') as target:
        merged.write(target)
def split_to_single_pages(path, name_of_split):
    """Write every page of a PDF to its own one-page file.

    The files are named ``<name_of_split><index>.pdf`` with a zero-based
    page index.

    Args:
        path: The PDF to read, passed to ``PdfFileReader``.
        name_of_split: Prefix of the generated file names.
    """
    source = PdfFileReader(path)
    for index in range(source.getNumPages()):
        single = PdfFileWriter()
        single.addPage(source.getPage(index))

        target_name = f'{name_of_split}{index}.pdf'
        with open(target_name, 'wb') as target:
            single.write(target)
def add_pdf_subset(pdf_writer, input_path, page_start, page_end):
    """Add a range of pages from a PDF to an existing writer.

    Args:
        pdf_writer: Writer that receives the pages.
        input_path: The PDF to read, passed to ``PdfFileReader``.
        page_start: Zero-based index of the first page to add.
        page_end: Zero-based index one past the last page to add.

    Raises:
        IndexError: If the document has fewer than ``page_end`` pages.
    """
    pdf_reader = PdfFileReader(input_path)

    # The original built this exception without raising it, so the check had
    # no effect. Raise before any page is added to the writer.
    if pdf_reader.getNumPages() < page_end:
        raise IndexError("too few pages")

    for page in range(page_start, page_end):
        # Add each page to the writer object
        pdf_writer.addPage(pdf_reader.getPage(page))
Example #29
0
def getPubFile(url):
    """Download a PDF and return its extracted text.

    Args:
        url: Address of the PDF to download.

    Returns:
        The text of all pages, each followed by ``" \\n"``, or ``None`` when
        the download fails with an ``HTTPError``.
    """
    try:
        filename = wget.download(url)
    except HTTPError as e:
        # Placeholders are required; extra arguments without them make the
        # logging call fail to format the message.
        logger.debug('Download error: %s %s', e.code, e.read())
        return None

    with open(filename, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)
        return "".join(
            pdf.getPage(i).extractText() + " \n"
            for i in range(pdf.getNumPages()))
Example #30
0
def merge_pdfs(paths, output):
    """Merge several PDFs, in order, into the file at ``output``.

    Args:
        paths: Iterable of PDFs to read.
        output: Path of the merged PDF to write.
    """
    combined = PdfFileWriter()

    for source_path in paths:
        source = PdfFileReader(source_path)
        page_total = source.getNumPages()
        # Copy every page of this input into the combined document.
        for number in range(page_total):
            combined.addPage(source.getPage(number))

    # Write out the merged PDF.
    with open(output, 'wb') as target:
        combined.write(target)