Example #1
0
def get_metadata(file, key):
    """Return one entry of a PDF's document-information dictionary.

    Args:
        file: Path of the PDF file to open.
        key: Entry name without the leading slash, e.g. ``'Author'``.

    Returns:
        The stored value, or the string ``'Key Error'`` when the entry is
        missing or the PDF has no document-information dictionary.
    """
    with open(file, 'rb') as pdf:
        metadata = PdfFileReader(pdf).getDocumentInfo()
        print(metadata)
        # getDocumentInfo() returns None for a PDF without an /Info
        # dictionary; report that the same way as a missing key instead of
        # failing with AttributeError.
        if metadata is None:
            return 'Key Error'
        return metadata.get('/' + key, 'Key Error')
Example #2
0
    def test_merge_pdf_output(self):
        """Merge the sample files and check the page count of the result.

        Each entry is either a plain path or a ``(path, tuple)`` pair and is
        handed to ``MergeToPdf`` unchanged (the tuple is presumably a page
        selection -- confirm against ``MergeToPdf``).
        """
        merged_path = 'test_merged_pdf.pdf'
        sample_inputs = [
            'tests/pdf_samples/jpeg_w_350.jpg',
            'tests/pdf_samples/pdf_sample_A Sample PDF_loremIpsum_pages_01.pdf',
            'tests/pdf_samples/pdf_sample_b_pages_01.pdf',
            'tests/pdf_samples/pdf_sample_dummy_w3c_pages_01.pdf',
            'tests/pdf_samples/pdf_sample_googledocs_image_pages_02.pdf',
            # Excluded: the following sample fails to read with
            # "invalid literal for int() with base 10: b'F-1.4'".
            # 'tests/pdf_samples/pdf_sample_googlesheet_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_libreoffice_exported_ISO19005_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_libreoffice_exported_format_FDF_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_libreoffice_exported_hibrid_format_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_libreoffice_exported_not_hybrid_ISO19005_pages_02.pdf',
            'tests/pdf_samples/pdf_sample_pages_01.pdf',
            ('tests/pdf_samples/pdf_sample_readthedocs_pdf_networkdays_pages_019.pdf', (0, 2)),
            'tests/pdf_samples/pdf_sample_text_edit_macos_pages_01.pdf',
            'tests/pdf_samples/pdf_sample_wikimedia_org_pages_01.pdf',
            'tests/pdf_samples/sample_pdf_commandline_xhtml2pdf_generated_pages_01.pdf',
            'tests/pdf_samples/issue_repo_pypdf4.pdf',
            'tests/pdf_samples/issue_repo_pypdf4_test.pdf',
        ]

        merger = MergeToPdf(paths_list=sample_inputs, output_file_path=merged_path)
        merger.merge_pdfs()

        # Re-open the merged file and count its pages.
        with open(merged_path, "rb") as merged_file:
            page_total = PdfFileReader(merged_file).getNumPages()

            self.assertEqual(page_total, 23)
Example #3
0
    def merge(self):
        """Merge every PDF listed in the tree view into one saved file.

        Asks for a save path first and aborts with an error dialog when none
        is given. Encrypted files that cannot be decrypted are skipped with a
        warning; all other files contribute all of their pages, in tree
        order.
        """
        target = save_as_pdf()
        if not target:
            messagebox.showerror(MESSAGE_TITLE, "You must specify a file save path.")
            return

        # Make sure the output carries a .pdf extension.
        if not target.lower().endswith(".pdf"):
            target += ".pdf"

        writer = PdfFileWriter()

        for row_id in self.tree.get_children():
            row = self.tree.item(row_id, option="values")
            reader = PdfFileReader(row[1])
            if reader.isEncrypted and not decrypt(reader, MESSAGE_TITLE):
                messagebox.showwarning(
                    MESSAGE_TITLE,
                    f"{row[0]} could not be decrypted. It will not be included in the merge.",
                )
                continue

            for index in range(reader.getNumPages()):
                writer.addPage(reader.getPage(index))

        with Path(target).open(mode="wb") as merged_file:
            writer.write(merged_file)

        messagebox.showinfo(MESSAGE_TITLE, "PDF Merged")
def pdf_meta(filename):
    """Print every document-information entry of a PDF.

    Args:
        filename: Path of the PDF file to inspect.
    """
    with open(filename, 'rb') as pdf:
        doc = PdfFileReader(pdf).getDocumentInfo()
        print(f'[***] PDF MetaData: {str(filename)}')
        # getDocumentInfo() returns None when the PDF has no /Info
        # dictionary; print only the header then instead of raising TypeError.
        for item in doc or {}:
            print(f'[++] {item} : {doc[item]}')
Example #5
0
    def extract_information(self, pdf_path, link):
        """Collect document metadata for one PDF.

        Args:
            pdf_path: Path of the PDF file to read.
            link: Download link, stored unchanged in the result.

        Returns:
            A ``(pdf_path, info)`` tuple. ``info`` is a dict holding the
            author, creator, producer, subject, title, the creation and
            modification dates (``None`` when they cannot be extracted), the
            page count and ``link``.
        """
        with open(pdf_path, 'rb') as f:
            pdf = PdfFileReader(f)
            information = pdf.getDocumentInfo()
            number_of_pages = pdf.getNumPages()

        try:
            readable1 = extract_date(
                information.getText("/CreationDate").split('-'))
            readable2 = extract_date(
                information.getText("/ModDate").split('-'))
        except Exception:
            # Best effort: a missing or malformed date leaves both dates
            # unset. Catching Exception instead of using a bare ``except``
            # lets KeyboardInterrupt and SystemExit propagate.
            readable1 = None
            readable2 = None

        info = {
            "author": information.author,
            "creator": information.creator,
            "producer": information.producer,
            "subject": information.subject,
            "title": information.title,
            "creation_date": readable1,
            "modification_date": readable2,
            "number_of_pages": number_of_pages,
            "download_link": link
        }

        return pdf_path, info
Example #6
0
    def rotate_pdf(self, path: str, page_num: str, rotate_type: str,
                   outpdf: str):
        """
        Rotate a single PDF page by 90 degrees and save it as a new file.

        path: path of the PDF file to process
        page_num: page number; converted with int() and passed to getPage()
        rotate_type: "0" rotates clockwise, "1" rotates counter-clockwise
        outpdf: output PDF file name, without a directory part

        Returns a status message; any other rotate_type yields the error
        message and nothing is written.
        """
        writer = PdfFileWriter()
        reader = PdfFileReader(path)

        if rotate_type not in ("0", "1"):
            return "输入错误,请重新输入!"

        page = reader.getPage(int(page_num))
        if rotate_type == "0":
            # Rotate 90 degrees clockwise.
            rotated = page.rotateClockwise(90)
        else:
            # Rotate 90 degrees counter-clockwise.
            rotated = page.rotateCounterClockwise(90)
        writer.addPage(rotated)

        # The output lands under the prefix stored in self.processed.
        with open(self.processed + outpdf, "wb") as out_file:
            writer.write(out_file)

        return "旋转页面完成!"
Example #7
0
    def encrypt_file(self):
        """Encrypt the selected PDF with the entered password and save a copy.

        Shows an error dialog and stops when no file is selected, no password
        is entered or no save path is chosen, and a warning when the source
        file is already encrypted.
        """
        path = self.file_selector.getpath()
        if not path:
            messagebox.showerror(MESSAGE_TITLE, "You must select a PDF file.")
            return

        if not self.password.get():
            messagebox.showerror(MESSAGE_TITLE, "You must enter a password.")
            return

        pdf_reader = PdfFileReader(path)
        if pdf_reader.isEncrypted:
            messagebox.showwarning(MESSAGE_TITLE, "File is already encrypted.")
            return

        pdf_writer = PdfFileWriter()

        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

        pdf_writer.encrypt(self.password.get())

        save_path = save_as_pdf(parent=self)
        if not save_path:
            messagebox.showerror(MESSAGE_TITLE,
                                 "You must specify a file save path")
            # Stop here: without this return the code below would go on to
            # build a file name from the empty path.
            return

        # Make sure the output carries a .pdf extension.
        if save_path[-4:].lower() != ".pdf":
            save_path += ".pdf"

        with Path(save_path).open(mode="wb") as save_file:
            pdf_writer.write(save_file)

        messagebox.showinfo(MESSAGE_TITLE, "PDF encrypted.")
Example #8
0
def splitPages(testNameNum, testPath, pageRanges, outputDirs):
    """Split a PDF into sections, each fronted by a watermarked cover page.

    Args:
        testNameNum: Indexable; items ``[1]`` and ``[0]`` are passed to
            ``genSectionFilePath`` to build each output file name.
        testPath: Path of the source PDF.
        pageRanges: Iterable of exclusive end page indexes, one per section.
            The first section starts at page index 1 (page 0 is the cover)
            and each later section starts where the previous one ended.
        outputDirs: Passed through to ``genSectionFilePath``.
    """
    start = 1
    for key, bookmark in enumerate(pageRanges):
        # The source is re-read for every section, as in the original code
        # (presumably because addWaterMark modifies the cover page in place
        # -- TODO confirm). The with-block closes each handle, and an empty
        # pageRanges no longer ends in a NameError on an unbound file object.
        with open(testPath, 'rb') as f:
            pdf = PdfFileReader(f)
            pdfWriter = PdfFileWriter()

            # Watermark the original cover and put it at the front.
            newCover = addWaterMark(pdf.getPage(0), key)
            pdfWriter.addPage(newCover)

            for page in range(start, bookmark):
                pdfWriter.addPage(pdf.getPage(page))

            outputFileName = genSectionFilePath(outputDirs, testNameNum[1],
                                                testNameNum[0], key)
            # Write while the source file is still open.
            with open(outputFileName, 'wb') as out:
                pdfWriter.write(out)
        start = bookmark
Example #9
0
def main():
    """Command-line entry point: copy the input PDF and fix it up.

    Copies every page, inserts a blank page at index 4, adds the outline
    entries from the module-level ``outline``, sets the page layout and the
    title/author metadata, then writes the output file.
    """
    cli = argparse.ArgumentParser(
        prog='ca6fix',
        description="Fix some disappointmented points in Computer Architecture Quantitative Approach 6th Edition Japanese translation PDF file.",
        usage='ca6fix -i ca6.pdf -o ca6_fixed.pdf',
        add_help=True,
    )
    cli.add_argument('-i', '--input', help='input PDF file', required=True)
    cli.add_argument('-o', '--output', help='output PDF file', required=True)
    options = cli.parse_args()

    source = PdfFileReader(options.input)
    target = PdfFileWriter()
    for page_index in range(source.getNumPages()):
        target.addPage(source.getPage(page_index))

    target.insertBlankPage(None, None, 4)

    for entry in outline:
        add_outline(target, entry, 21)

    target.setPageLayout('/TwoPageRight')

    metadata = {
        '/Title': 'コンピュータアーキテクチャ 定量的アプローチ[第6版]',
        '/Author': 'ジョン・L・ヘネシー, デイビッド・A・パターソン(著), 中條拓伯, 天野英晴, 鈴木 貢(訳)',
    }
    target.addMetadata(metadata)

    with open(options.output, 'wb') as out_file:
        target.write(out_file)
Example #10
0
def imprimir_metadata(nombreArchivo):
    """Print every document-information entry of a PDF.

    Args:
        nombreArchivo: Path of the PDF file to inspect.
    """
    with open(nombreArchivo, 'rb') as pdf:
        doc_info = PdfFileReader(pdf).getDocumentInfo()
        print(f'[*] MetaData PDF para el archivo: {str(nombreArchivo)}')
        # getDocumentInfo() returns None when the PDF has no /Info
        # dictionary; print only the header then instead of raising TypeError.
        for meta_item in doc_info or {}:
            print(f'[+] {meta_item}: {doc_info[meta_item]}')
Example #11
0
def pdf_to_txt(file):
    """Return the lower-cased text of all pages of a PDF, concatenated.

    Args:
        file: Passed unchanged to ``PdfFileReader``.
    """
    reader = PdfFileReader(file)
    page_texts = (
        reader.getPage(index).extractText()
        for index in range(reader.numPages)
    )
    return ''.join(page_texts).lower()
    def _merge_documents_PyPDF4(self, file_name, paths):
        """Concatenate the given PDFs into one file under ``settings.SAVE_PATH``.

        Args:
            file_name: Name of the merged file to create.
            paths: Iterable of source PDF paths; falsy entries are skipped.

        Returns:
            The output path, both on success and after a ``PdfReadError``
            has been logged.
        """
        output = settings.SAVE_PATH / file_name
        try:
            writer = PdfFileWriter()

            for source_path in paths:
                if not source_path:
                    continue
                reader = PdfFileReader(str(source_path), strict=False)
                # Copy every page of this source into the writer.
                for index in range(reader.getNumPages()):
                    writer.addPage(reader.getPage(index))

            # Write out the merged PDF.
            with open(output, 'wb') as merged:
                writer.write(merged)
        except utils.PdfReadError as error:
            LogHandler.execution_log(error=error)
            LogHandler.execution_log(
                error=f'ERROR ON: {output.name.replace(".PDF", "")}')

        return output
Example #13
0
def unwatermark_pdf(input_file: str, wm_text: str, pages: Tuple = None):
    """
    Blank out watermark text in the page content streams of a PDF.

    Every ``Tj`` operation whose string operand starts with ``wm_text`` gets
    an empty string instead. When ``pages`` is given, only pages whose
    zero-based index, as a string, is contained in it are processed and added
    to the writer.

    Returns ``(True, reader, writer)``. Nothing is written to disk here, and
    the input file opened for the reader is not closed by this function.
    """
    reader = PdfFileReader(open(input_file, 'rb'), strict=False)
    writer = PdfFileWriter()
    for page_index in range(reader.getNumPages()):
        # Skip pages outside the requested selection, if there is one.
        if pages and str(page_index) not in pages:
            continue
        page_obj = reader.getPage(page_index)
        # Parse the page content into a list of operations.
        content = ContentStream(page_obj["/Contents"].getObject(), reader)
        for operands, operator in content.operations:
            if operator != b_("Tj"):
                continue
            # Tj shows a text string; blank it when it is the watermark.
            shown_text = operands[0]
            if isinstance(shown_text, str) and shown_text.startswith(wm_text):
                operands[0] = TextStringObject('')
        page_obj[NameObject('/Contents')] = content
        writer.addPage(page_obj)
    return True, reader, writer
Example #14
0
def split_pages(testnamenum, test_path, page_ranges):
    """Split a PDF into sections, each fronted by a watermarked cover page.

    Args:
        testnamenum: Passed to ``generate_section_filepath`` to build each
            output file name.
        test_path: Path of the source PDF.
        page_ranges: Iterable of exclusive end page indexes, one per section.
            The first section starts at page index 1 (page 0 is the cover)
            and each later section starts where the previous one ended.
    """
    start = 1
    for key, bookmark in enumerate(page_ranges):
        # The source is re-read for every section, as in the original code
        # (presumably because add_watermark modifies the cover page in place
        # -- TODO confirm). The with-block closes each handle, and an empty
        # page_ranges no longer ends in a NameError on an unbound file object.
        with open(test_path, 'rb') as f:
            pdf = PdfFileReader(f)
            pdf_writer = PdfFileWriter()

            # Watermark the original cover and put it at the front.
            new_cover = add_watermark(pdf.getPage(0), key)
            pdf_writer.addPage(new_cover)

            for page in range(start, bookmark):
                pdf_writer.addPage(pdf.getPage(page))

            local_filename = generate_section_filepath(CONST_LOCAL, testnamenum, key)

            # Write while the source file is still open.
            with open(local_filename, 'wb') as out:
                pdf_writer.write(out)

        start = bookmark
Example #15
0
def split(input, destination, pages, format, verbosity, **kwargs):
    """split pdf into single page file.

pdfcli split document.pdf --format page-%02d.pd -p 1,10-20

"""
    # Page numbers are 1-based; without a selection every page is exported.
    if pages is None:
        pages = range(1, PdfFileReader(input).numPages + 1)

    out_dir = Path(destination)
    if not out_dir.exists():
        out_dir.mkdir(parents=True)

    for page_num in pages:
        if verbosity >= 1:
            click.echo("Extracting page %s" % page_num)
        # due to a bug PyPDF4 file need to be reopened
        reader = PdfFileReader(input)
        target = (out_dir / (format % page_num)).absolute()
        single_page = PdfFileWriter()
        single_page.addPage(reader.getPage(page_num - 1))
        with open(str(target), "wb") as out_file:
            single_page.write(out_file)
Example #16
0
def rotate(input, output, pages, verbosity, rotate, **kwargs):
    """rotate selected pages

Rotate selected pages and outputs in new pdf
"""
    # The docstring above is left untouched because it may be shown as
    # command-line help text.
    #
    # input: source passed to PdfFileReader.
    # output: writable binary stream with a ``name`` attribute.
    # pages: 1-based page numbers to rotate; None selects every page.
    # rotate: one of 'left', 'right' or 'inverted'; anything else raises
    #     KeyError.
    source = PdfFileReader(input)

    angle = {'left': -90, 'right': 90, 'inverted': 180}[rotate]
    page_total = source.getNumPages()
    if pages is None:
        # Page numbers are 1-based, so the range must include page_total.
        pages = range(1, page_total + 1)

    output_pdf = PdfFileWriter()
    # Every page is copied to the output, including the last one; only the
    # selected pages are rotated.
    for page_num in range(1, page_total + 1):
        if verbosity >= 1:
            click.echo(".", nl=False)
        if verbosity >= 2:
            click.echo("Extracting page %s" % page_num)
        page = source.getPage(page_num - 1)
        if page_num in pages:
            page.rotateClockwise(angle)
        output_pdf.addPage(page)

    if verbosity >= 1:
        click.echo("Writing %s" % output.name)
    output_pdf.write(output)
Example #17
0
async def add_watermark(file_path,stage,fileno):
    """Add a watermark to a PDF, encrypt the result and save it.

    Encrypted input files are skipped without any change.

    Args:
        file_path: Path of the PDF; passed to ``PdfFileReader`` and to
            ``savepdf``.
        stage: Passed through to ``create_watermark``.
        fileno: Passed through to ``create_watermark``.
    """
    pdf_input = PdfFileReader(file_path)
    if pdf_input.isEncrypted:
        return
    pdf_info = pdf_input.getDocumentInfo()
    w, h = pdf_input.getPage(0).mediaBox[2:]
    # Convert the page size to millimetres.
    page = (int(w)*0.3528, int(h)*0.3528)

    # Create the watermark file sized for this document.
    mark = create_watermark(page, stage, fileno)

    loop = asyncio.get_event_loop()
    # Merging and saving run in the default executor so they do not block
    # the event loop.
    pdf_output = await loop.run_in_executor(None, merge, pdf_input, mark)
    # Set the passwords.
    # NOTE(review): the owner password is hard-coded -- consider making it
    # configurable.
    pdf_output.encrypt(user_pwd='', owner_pwd='12345', use_128bit=True)
    # getDocumentInfo() returns None for a PDF without an /Info dictionary;
    # addMetadata() cannot handle None, so copy metadata only when present.
    if pdf_info:
        pdf_output.addMetadata(pdf_info)
    await loop.run_in_executor(None, savepdf, pdf_output, file_path)
Example #18
0
def print_meta(filename):
    """Print every document-information entry of a PDF.

    Args:
        filename: Path of the PDF file to inspect.
    """
    with open(filename, 'rb') as pdf:
        doc_info = PdfFileReader(pdf).getDocumentInfo()
        print(f'[*] PDF MetaData For: {str(filename)}')
        # getDocumentInfo() returns None when the PDF has no /Info
        # dictionary; print only the header then instead of raising TypeError.
        for meta_item in doc_info or {}:
            print(f'[+] {meta_item}: {doc_info[meta_item]}')
Example #19
0
def extract(input, output, pages, verbosity, **kwargs):
    """extract one or multiple pages and build a new document.

pdfcli extract source.pdf -o clear.pdf -p 1,3-5


"""
    # The docstring above is left untouched because it may be shown as
    # command-line help text.
    #
    # input: source passed to PdfFileReader.
    # output: writable binary stream with a ``name`` attribute.
    # pages: 1-based page numbers to copy, in order; None selects every page.
    source = PdfFileReader(input)

    if pages is None:
        # Page numbers are 1-based, so the range must include numPages;
        # otherwise the last page is silently left out.
        pages = range(1, source.numPages + 1)

    output_pdf = PdfFileWriter()
    for page_num in pages:
        if verbosity >= 1:
            click.echo(".", nl=False)
        if verbosity >= 2:
            click.echo("Extracting page %s" % page_num)

        output_pdf.addPage(source.getPage(page_num - 1))

    if verbosity >= 1:
        click.echo("Writing %s" % output.name)
    output_pdf.write(output)
Example #20
0
    def splitPdf(path='./input.pdf', N=5):
        """Split a PDF into two files: pages 1..N and pages N+1..end.

        The outputs are written next to the input and named
        ``<stem>p1_<N>.pdf`` and ``<stem>p<N+1>_<total>.pdf``. Nothing is
        written when ``path`` is not a file or the document has at most N
        pages.

        Args:
            path: Path of the PDF to split.
            N: Number of pages that go into the first output file.
        """
        if not os.path.isfile(path):
            return

        pdfFileReader = PdfFileReader(path)

        numPages = pdfFileReader.getNumPages()  # total number of pages
        print(numPages)

        fname = os.path.splitext(path)[0]
        outFile1 = '{}p1_{}.pdf'.format(fname, N)
        outFile2 = '{}p{}_{}.pdf'.format(fname, N + 1, numPages)
        print(outFile1, outFile2)

        if numPages > N:
            # Page indexes are zero-based: the first N pages are 0..N-1 and
            # the remainder starts at index N, so no page is dropped.
            parts = (
                (outFile1, range(N)),
                (outFile2, range(N, numPages)),
            )
            for outFile, indexes in parts:
                # A fresh writer per part keeps the second file from also
                # containing the pages of the first one.
                pdfFileWriter = PdfFileWriter()
                for index in indexes:
                    pdfFileWriter.addPage(pdfFileReader.getPage(index))
                # Save all collected pages, closing the file afterwards.
                with open(outFile, 'wb') as out:
                    pdfFileWriter.write(out)
Example #21
0
def make_booklet(input_name, output_name, blanks=0):
    """Rearrange a PDF into double-width booklet sheets and write the result.

    Args:
        input_name: Path of the source PDF.
        output_name: Path of the PDF to write.
        blanks: Number of empty (``None``) pages put in front of the
            document before the sheets are built.
    """
    # The input stays open until the output has been written; both files
    # are closed by the with-blocks.
    with open(input_name, "rb") as source:
        reader = PdfFileReader(source)
        pages = [None] * blanks
        pages.extend(reader.getPage(p) for p in range(reader.getNumPages()))

        sheets = build_booklet(pages)

        writer = PdfFileWriter()
        # An output page is twice as wide as the first input page and has
        # the same height.
        p0 = reader.getPage(0)
        page_size = (p0.mediaBox.getWidth() * 2, p0.mediaBox.getHeight())

        # We want to group fronts and backs together.
        for sheet in sheets:
            add_double_page(writer, page_size, sheet.back)

        for sheet in sheets:
            add_double_page(writer, page_size, sheet.front)

        with open(output_name, "wb") as target:
            writer.write(target)
    print_instructions(sheets)
Example #22
0
def join(ctx, inputs, output, verbosity, **kwargs):
    """join multiple pdf together in a single file.

pdfcli join files*.pdf -o joined.pdf

"""
    # The docstring above is left untouched because it may be shown as
    # command-line help text.
    #
    # ctx: context object; ``ctx.exit(1)`` is called on invalid input.
    # inputs: paths of the PDFs to concatenate, in order.
    # output: writable binary stream with a ``name`` attribute.
    if not inputs:
        click.echo("No input files")
        ctx.exit(1)

    for path in inputs:
        if not Path(path).exists():
            if verbosity >= 1:
                click.echo("File not found '%s'" % path, err=True)
            # Abort whatever the verbosity is; only the message is optional.
            ctx.exit(1)

    out = PdfFileWriter()

    for path in inputs:
        source = PdfFileReader(path)
        if verbosity >= 1:
            click.echo("Adding %s" % path)
        for page_num in range(0, source.numPages):
            out.addPage(source.getPage(page_num))

    out.write(output)
    if verbosity >= 1:
        click.echo("Writing %s" % output.name)
Example #23
0
def generate_pdf(template_name, request, contexte, extra_pdf_files=None):
    """Render a template to PDF and return it as an HTTP response.

    Args:
        template_name: Name of the template to render.
        request: Current request; ``request.user`` is added to the context.
        contexte: Template context dict. It is updated in place with
            ``MEDIA_ROOT``, ``cdate`` and ``user``.
        extra_pdf_files: Optional PDFs appended after the rendered document.

    Returns:
        A PDF response on success, the rendered ``pdf_error.html`` page when
        an extra file cannot be appended, or a plain fallback response
        containing the escaped HTML when PDF creation fails.
    """
    # cgi.escape() was removed in Python 3.8; html.escape(quote=False)
    # produces the same output.
    from html import escape as escape_html

    template = get_template(template_name)
    contexte.update({
        'MEDIA_ROOT': settings.MEDIA_ROOT,
        'cdate': now(),
        'user': request.user
    })
    markup = template.render(contexte)

    try:
        result = BytesIO()
        pisa_status = pisa.CreatePDF(markup, result)
        if not pisa_status.err:
            if extra_pdf_files:
                from PyPDF4 import PdfFileWriter, PdfFileReader
                output = PdfFileWriter()
                append_pdf(PdfFileReader(result), output)
                # Fresh buffer for the combined document.
                result = BytesIO()
                for pdf_file in extra_pdf_files:
                    try:
                        append_pdf(PdfFileReader(pdf_file), output)
                    except Exception:
                        return render(request, "pdf_error.html", {
                            'pdf': pdf_file,
                            'error': traceback.format_exc()
                        })
                output.write(result)
            return http.HttpResponse(result.getvalue(),
                                     content_type='application/pdf')
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.exception(e)
    # Reached when pisa reported an error or an exception was logged above.
    return http.HttpResponse(
        'Gremlins ate your pdf! %s' % escape_html(markup, quote=False))
def remove_watermark(wm_text, inputFile, outputFile):
    """Write a copy of a PDF with matching watermark text blanked out.

    Every ``Tj`` operation whose string operand starts with ``wm_text`` gets
    an empty string instead.

    Args:
        wm_text: Prefix identifying the watermark text.
        inputFile: Path of the source PDF.
        outputFile: Path of the PDF to write.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    with open(inputFile, "rb") as f:
        # The original passed "rb" as the second positional argument, which
        # is PdfFileReader's ``strict`` flag; the truthy string just meant
        # strict mode, which is also the default.
        source = PdfFileReader(f)
        output = PdfFileWriter()

        for page_index in range(source.getNumPages()):
            page = source.getPage(page_index)
            content_object = page["/Contents"].getObject()
            content = ContentStream(content_object, source)

            for operands, operator in content.operations:
                if operator == b_("Tj"):
                    text = operands[0]

                    if isinstance(text, str) and text.startswith(wm_text):
                        operands[0] = TextStringObject('')

            page[NameObject('/Contents')] = content
            output.addPage(page)

        # Write while the source file is still open.
        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)
Example #25
0
def get_info(input_file: str):
    """
    Collect and print basic information about a PDF file.

    The result always has the "File" and "Encrypted" entries. Author,
    creator, producer, subject, title and page count are added only for
    unencrypted files. Returns ``(True, output)``.
    """
    with open(input_file, 'rb') as stream:
        reader = PdfFileReader(stream, strict=False)
        encrypted = reader.isEncrypted
        output = {
            "File": input_file,
            "Encrypted": "True" if encrypted else "False",
        }
        # Metadata of an encrypted PDF cannot be extracted.
        if not encrypted:
            info = reader.getDocumentInfo()
            page_total = reader.getNumPages()
            output.update({
                "Author": info.author,
                "Creator": info.creator,
                "Producer": info.producer,
                "Subject": info.subject,
                "Title": info.title,
                "Number of pages": page_total,
            })

    # Display the collected metadata between two banner lines.
    header = "## File Information ##################################################"
    footer = "######################################################################"
    body = "\n".join(f"{label}:{value}" for label, value in output.items())
    print(header, body, footer, sep="\n")
    return True, output
Example #26
0
def rotate_pdf(path, degrees, output):
    """Rotate every page of a PDF clockwise and write the result.

    Args:
        path: Source passed to ``PdfFileReader``.
        degrees: Rotation angle; converted with ``int()`` for each page.
        output: Target passed unchanged to ``PdfFileWriter.write``.
    """
    writer = PdfFileWriter()
    reader = PdfFileReader(path)

    for index in range(reader.getNumPages()):
        rotated = reader.getPage(index).rotateClockwise(int(degrees))
        writer.addPage(rotated)
    writer.write(output)
Example #27
0
def readWritePdf():
    """Copy the first page of the sample PDF into ``./new.pdf``."""
    with open("./Python 面试题.pdf", "rb") as source_file:
        reader = PdfFileReader(source_file)
        writer = PdfFileWriter()
        writer.addPage(reader.getPage(0))
        # The output is opened only after the first page has been read.
        with open("./new.pdf", "wb") as target_file:
            writer.write(target_file)
Example #28
0
def merge(pdf_input, mark):
    """Overlay the first page of a watermark PDF onto every input page.

    Args:
        pdf_input: Reader providing the pages to stamp.
        mark: Path of the watermark PDF.

    Returns:
        A ``PdfFileWriter`` holding the stamped pages. The watermark file
        opened here is not closed by this function.
    """
    page_total = pdf_input.getNumPages()
    stamped = PdfFileWriter()
    watermark = PdfFileReader(open(mark, 'rb'), strict=False)
    for index in range(page_total):
        current = pdf_input.getPage(index)
        current.mergePage(watermark.getPage(0))
        stamped.addPage(current)
    return stamped
Example #29
0
 def on_file_selected(self):
     """Show the page count of the selected PDF, or reset the selection.

     Does nothing when no path is selected. An encrypted file that cannot
     be decrypted clears both the page count text and the file selector.
     """
     if not self.file_selector.getpath():
         return

     reader = PdfFileReader(self.file_selector.getpath())
     # decrypt() is only consulted for encrypted files.
     readable = not reader.isEncrypted or decrypt(reader, MESSAGE_TITLE)
     if readable:
         self.page_count_text.set(reader.getNumPages())
     else:
         self.page_count_text.set("")
         self.file_selector.clear()
Example #30
0
 def extract(fileobj):
     """Extract the text of every page of a PDF.

     Returns one concatenated string when the enclosing ``fmt`` equals
     "string", otherwise a list with one string per page.
     """
     reader = PdfFileReader(fileobj, strict=False)
     page_texts = [
         reader.getPage(index).extractText()
         for index in range(reader.getNumPages())
     ]
     if fmt == "string":
         return "".join(page_texts)
     return page_texts