Esempio n. 1
0
    def split(file):
        """
        Split the chosen PDF into one new PDF file per page.

        Every page of the input document is written to
        ``Splitter.splitter_dir`` as ``página_<n>.pdf`` (``n`` is 1-based).

        :param file: The file chosen by the user whose pages will be separated
        :return: None
        """
        # Clean the output directory to avoid duplicate files/directories
        Splitter.cleanDir()

        # Convert the user-supplied file name via Merger.toPath
        file = Merger.toPath(file)

        with open(file, mode='rb') as pdf_file_to_read:
            # Parse the document once; the reader is reused for every page
            # instead of being rebuilt on each loop iteration.
            pdf_file = PdfFileReader(pdf_file_to_read)

            for page in range(pdf_file.numPages):
                current_page = PdfFileWriter()
                current_page.addPage(pdf_file.getPage(page))
                output_path = join(Splitter.splitter_dir,
                                   f"página_{page + 1}.pdf")
                with open(output_path, mode='wb') as pdf:
                    current_page.write(pdf)
Esempio n. 2
0
def PDFMerge(savePath, pdfPath, watermarkPdfPath):
    """
    Stamp the first page of a watermark PDF onto every page of a PDF.

    The watermark is translated so that its upper-right media box corner is
    centred relative to each page's upper-right media box corner.

    :param savePath: path of the PDF file to write
    :param pdfPath: path of the PDF file to watermark
    :param watermarkPdfPath: path of the PDF whose first page is the watermark
    :return: None
    """
    # Both inputs must stay open until the writer has serialised the result,
    # because page content is read lazily from the underlying streams.
    with open(pdfPath, 'rb') as pdfFile, \
            open(watermarkPdfPath, 'rb') as watermarkPdfFile:
        pdfReader = PdfFileReader(pdfFile, strict=False)
        watermarkPdf = PdfFileReader(watermarkPdfFile, strict=False).getPage(0)

        pdfWriter = PdfFileWriter()

        # Repeat for every page of the PDF
        for pageNum in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)

            # Compute the offsets that place the watermark in the centre
            x = (pageObj.mediaBox[2] - watermarkPdf.mediaBox[2]) / 2
            y = (pageObj.mediaBox[3] - watermarkPdf.mediaBox[3]) / 2

            # Merge the watermark page into the current page
            pageObj.mergeTranslatedPage(page2=watermarkPdf,
                                        tx=x,
                                        ty=y,
                                        expand=False)

            # Queue the merged page for output
            pdfWriter.addPage(pageObj)

        # Save; the context manager guarantees the output is flushed and closed
        with open(savePath, 'wb') as resultFile:
            pdfWriter.write(resultFile)
Esempio n. 3
0
def check_file_content(original, converted):
    """
    Compare a converted PDF against its original.

    :param original: path of the original PDF
    :param converted: path of the converted PDF
    :return: False when the page object has no contents or the page counts
        differ, True otherwise
    """
    # Read the page count inside a context manager so the handle is released.
    with open(original, mode="rb") as original_data:
        original_page_num = PdfFileReader(original_data, strict=False).numPages

    with open(converted, mode="rb") as converted_data:
        converted_pdf = PdfFileReader(converted_data, strict=False)
        # NOTE(review): this PageObject is built from the raw file handle, not
        # taken from converted_pdf; it looks like getContents() may therefore
        # always be None. Confirm whether converted_pdf.getPage(0) was intended.
        page = PageObject(converted_data)
        if (page.getContents() is None or original_page_num != converted_pdf.numPages):
            return False
    return True
Esempio n. 4
0
def pypdf3_reader(pdf, decrypt=None):
    """
    Build a PdfFileReader for a document, decrypting it when a password is given.

    :param pdf: PDF document to read
    :param decrypt: Password handed to ``PdfFileReader.decrypt``; skipped when falsy
    :return: PdfFileReader object
    """
    reader = PdfFileReader(pdf)
    if decrypt:
        reader.decrypt(decrypt)
    return reader
    def browse_PDF(self):
        """
        Ask the user for a PDF file and enable conversion when it is usable.

        The dialog result is stored in ``self.filename``. If the selected file
        can be parsed and is not encrypted, the ``convertPDF`` widget is
        enabled; if it cannot be parsed or is encrypted, an error dialog is
        shown. Cancelling the dialog does nothing.
        """
        self.filename = QtWidgets.QFileDialog.getOpenFileName(
            None, 'Select PDF File', '/', 'PDF File (*.pdf)')

        selected_path = self.filename[0]
        if not selected_path:
            # Nothing was selected (dialog dismissed): nothing to validate.
            return

        try:
            usable = not PdfFileReader(selected_path).getIsEncrypted()
        except Exception:
            # Any parsing failure is reported to the user as a corrupted file
            # instead of being silently swallowed.
            usable = False

        if usable:
            self.convertPDF.setDisabled(False)
            return

        msg = QMessageBox()
        msg.setWindowTitle('Error')
        msg.setText('File curropted or encrypted.')
        msg.setIcon(3)
        msg.exec_()
Esempio n. 6
0
def pdf_metadata_save(pdf_file, metadata, substitute_all_metadata = False, make_backup = True):
    """
    Rewrite a PDF in place with updated document-information metadata.

    The original file is first renamed to a backup, then a new file is written
    at ``pdf_file`` containing the same pages and the merged metadata.

    :param pdf_file: path of the PDF to update
    :param metadata: mapping of metadata names (without the leading ``/``) to
        values; every value is stored as ``str(value)``
    :param substitute_all_metadata: when true, the existing document info is
        not copied and only ``metadata`` is written
    :param make_backup: a string is used as the backup path; otherwise the
        backup is ``<pdf_file without extension>.bak`` and it is deleted
        afterwards when this argument equals ``False``
    :return: None
    """
    if isinstance(make_backup, str):
        bak_file = make_backup
    else:
        bak_file = os.path.splitext(pdf_file)[0] + ".bak"
    os.rename(pdf_file, bak_file)

    with open(bak_file, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()

        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        infoDict = writer._info.getObject()

        if not substitute_all_metadata:
            # documentInfo can be None when the source has no info dictionary.
            info = pdf_in.documentInfo
            if info:
                for key in info:
                    infoDict.update({key: info[key]})

        for key in metadata:
            infoDict.update({NameObject('/' + key): createStringObject(str(metadata[key]))})

        with open(pdf_file, 'wb') as fout:
            writer.write(fout)

    # Remove the backup only after its handle is closed; unlinking an open
    # file fails on some platforms.
    if make_backup == False:  # noqa: E712 - keeps the original comparison semantics
        os.unlink(bak_file)
Esempio n. 7
0
def extract_information(pdf_path, page_number=92, output_file="file.mp3"):
    """
    Read one page of a PDF aloud into an MP3 file and return the document info.

    :param pdf_path: path of the PDF to read
    :param page_number: zero-based index of the page whose text is spoken
        (defaults to 92, the value that used to be hard-coded)
    :param output_file: path of the MP3 file to write (defaults to "file.mp3")
    :return: the object returned by ``PdfFileReader.getDocumentInfo()``
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        # Extract the text once and reuse it.
        testread = pdf.getPage(page_number).extractText().strip()
        print(testread)

    print(testread)

    # Pass the language by keyword: newer gTTS releases take ``tld`` as the
    # second positional parameter, so a positional 'en' would be misread.
    tts = gTTS(testread, lang='en')
    tts.save(output_file)

    return information
Esempio n. 8
0
def pdf_mediabox(filename):
    """
    Report the first page's media box size via ``result_line``.

    The height and width are multiplied by ``points_to_mm`` and rounded
    before being passed on.

    :param filename: path of the PDF to inspect
    :return: whatever ``result_line(filename, height, width)`` returns
    """
    # Read the dimensions inside a context manager so the handle is closed.
    with open(filename, 'rb') as pdf_stream:
        page = PdfFileReader(pdf_stream).getPage(0).mediaBox
        width = float(page.getWidth())
        height = float(page.getHeight())
    return result_line(filename, round(height * points_to_mm),
                       round(width * points_to_mm))
Esempio n. 9
0
 def extract(self, report_path) -> pd.DataFrame:
     """Parse the table of a PDF report and return the derived-column frame.

     The report date is taken from page 0, the table text from the page found
     by ``find_table_page``; 11 rows of ``len(INPUT_COLUMNS)`` tokens each are
     converted, normalized, and checked against the recomputed columns.
     """
     reader = PdfFileReader(str(report_path))
     report_date = extract_datetime(extract_text(reader, page=0))
     table_text, _ = find_table_page(reader)
     table_text = self.unknown_age_matcher.sub('unknown', table_text)
     # from 28/09, they write "1,5" as "1, 5"
     table_body = table_text[table_text.find('0-9'):].replace(', ', ',')
     tokens = table_body.split(' ')
     row_count = 11
     width = len(INPUT_COLUMNS)
     rows = [
         [report_date,
          *convert_values(tokens[index * width:index * width + width],
                          COLUMN_CONVERTERS)]
         for index in range(row_count)
     ]
     extracted = normalize_table(
         pd.DataFrame(rows, columns=['date', *INPUT_COLUMNS]))
     recomputed = compute_derived_columns(extracted)
     # sanity check
     check_recomputed_columns_match_extracted_ones(
         extracted=extracted, recomputed=recomputed)
     return recomputed
Esempio n. 10
0
def get_file_mimetype(file):
    """
    Determine the MIME type of a file, refining PDFs and generic types.

    * ``application/pdf`` becomes ``application/pdfa`` when the XMP metadata
      reports a PDF/A version listed in ``app.config["PDFA"]``.
    * MIME types listed in ``app.config["GENERIC_MIMETYPES"]`` are re-checked
      with ``magic`` and then matched against the configured file formats.

    :param file: object exposing the file path as ``file.name``
    :return: the MIME type string, or "Unknown/Corrupted" when a ValueError or
        PdfReadError is raised
    """
    try:
        # The context manager terminates the exiftool subprocess afterwards.
        with exiftool.ExifToolHelper() as exif_helper:
            mimeTypeFile = exif_helper.get_metadata(file.name)[0]["File:MIMEType"]

        if mimeTypeFile == "application/pdf":
            # Check whether the PDF is PDF/A and which version
            with open(file.name, mode="rb") as fileData:
                pdf_reader = PdfFileReader(fileData, strict=False)
                try:
                    metadata = pdf_reader.getXmpMetadata()
                    if metadata:
                        pdfa = app.config["PDFA"]
                        nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
                        if get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]:
                            mimeTypeFile = "application/pdfa"
                except ExpatError:
                    app.logger.log(logging.WARNING, "File {0} has not well-formed XMP data, could not verify if application/pdf has PDF/A1 DOCINFO.".format(file.name))

        elif mimeTypeFile in app.config["GENERIC_MIMETYPES"]:
            mimeTypeFile = magic.from_file(file.name, mime=True)
            if mimeTypeFile in app.config["GENERIC_MIMETYPES"]:
                with open(file.name, mode="rb") as fileData:
                    documentTypeFile = magic.from_buffer(fileData.read(2048))
                # Pair MIME types with formats strictly: an unpaired entry
                # would otherwise yield a None format or a None MIME type.
                # The last matching pair wins, as before.
                for fileMimetype, fileFormat in zip(app.config["FILEMIMETYPES"], app.config["FILEFORMATS"]):
                    if documentTypeFile in fileFormat:
                        mimeTypeFile = fileMimetype
    except (ValueError, PdfReadError):
        mimeTypeFile = "Unknown/Corrupted"
    return mimeTypeFile
Esempio n. 11
0
    def _extract(self, report_path) -> pd.DataFrame:
        """Extract the raw table of a PDF report as a DataFrame.

        :param report_path: path of the PDF report
        :return: frame with a ``date`` column followed by ``INPUT_COLUMNS``,
            one row per table row (11 rows)
        :raises TableExtractionError: when a row's tokens cannot be converted
        """
        num_rows = 11
        num_columns = len(INPUT_COLUMNS)

        pdf = PdfFileReader(str(report_path))
        date = extract_datetime(extract_text(pdf, page=0))
        page, _ = find_table_page(pdf)
        page = self.unknown_age_matcher.sub("unknown", page)
        data_start = page.find("0-9")
        # on 2020-09-28, they wrote floats like "1, 5"
        raw_data = page[data_start:].replace(", ", ",")
        tokens = raw_data.split()
        # In some cases, PyPDF3 doesn't read the token "≥90" (probably a bug),
        # so I insert that manually in case is missing. Couldn't the token in
        # that position be "90" by coincidence? Nope. If "≥90" is missing, the
        # token in that position is the cumulative total of cases with age >= 90
        # which has never been equal to 90 (and never will be).
        if tokens[9 * num_columns] not in {"90", ">90", "≥90"}:
            tokens.insert(9 * num_columns, ">=90")
        rows = []
        for i in range(num_rows):
            start = i * num_columns
            end = start + num_columns
            row_tokens = tokens[start:end]
            try:
                values = convert_values(row_tokens, COLUMN_CONVERTERS)
            # A tuple is required here: "ValueError or TypeError" evaluates to
            # ValueError alone, so TypeError was never caught.
            except (ValueError, TypeError) as err:
                logger.debug('Error in row %d: %s', i, err)
                raise TableExtractionError(
                    f"\nError while converting values of row {i}: {err}.\n"
                    f"Row tokens: {' | '.join(row_tokens)}") from err
            row = [date, *values]
            rows.append(row)
        report_data = pd.DataFrame(rows, columns=["date", *INPUT_COLUMNS])
        return report_data
Esempio n. 12
0
    def test_PdfReaderJpegImage(self):
        '''
        Load jpeg.pdf, pull the '/Im4' image out of the first page and check
        that its hex dump equals the contents of jpeg.txt.
        '''
        pdf_path = os.path.join(RESOURCE_ROOT, 'jpeg.pdf')
        expected_path = os.path.join(RESOURCE_ROOT, 'jpeg.txt')

        with open(pdf_path, 'rb') as inputfile:
            reader = PdfFileReader(inputfile)

            # Expected hex dump of the image
            with open(expected_path, 'r') as expected_file:
                imagetext = expected_file.read()

            first_page = reader.getPage(0)
            x_objects = first_page['/Resources']['/XObject'].getObject()
            image_bytes = x_objects['/Im4'].getData()

            # Hex-encode the extracted image and compare it to the known source
            extracted_hex = binascii.hexlify(image_bytes).decode()
            failure_message = (
                'PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                % (imagetext, binascii.hexlify(image_bytes).decode()))
            self.assertEqual(extracted_hex, imagetext, msg=failure_message)
Esempio n. 13
0
    def test_PdfReaderFileLoad(self):
        '''
        Load crazyones.pdf, extract the text of its first page (newlines
        removed, UTF-8 encoded) and check it equals the bytes of crazyones.txt.
        '''
        pdf_path = os.path.join(RESOURCE_ROOT, 'crazyones.pdf')
        expected_path = os.path.join(RESOURCE_ROOT, 'crazyones.txt')

        with open(pdf_path, 'rb') as inputfile:
            first_page = PdfFileReader(inputfile).getPage(0)

            # Expected text, read as raw bytes
            with open(expected_path, 'rb') as expected_file:
                pdftext = expected_file.read()

            raw_text = first_page.extractText()
            ipdf_p1_text = raw_text.replace('\n', '').encode('utf-8')

            # Compare the extracted text against the known source
            failure_message = (
                'PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n'
                % (pdftext, ipdf_p1_text))
            self.assertEqual(ipdf_p1_text, pdftext, msg=failure_message)
Esempio n. 14
0
def get_reader(filename, password):
    """
    Open a PDF and return a reader for it, decrypting it when necessary.

    The opened file object is stored in the module-level ``old_file`` and is
    left open on success because the returned reader reads from it.

    :param filename: path of the PDF to open
    :param password: password used when the PDF is encrypted, or None
    :return: a PdfFileReader, or None when the file cannot be opened, needs a
        password that was not supplied, or the password is rejected
    """
    global old_file
    try:
        old_file = open(filename, 'rb')
    except Exception as err:
        print('文件打开失败!' + str(err))
        write_result(filename, '文件打开失败')
        return None

    # Create the reader instance
    pdf_reader = PdfFileReader(old_file, strict=False)

    # Decrypt if needed
    if pdf_reader.isEncrypted:
        if password is None:
            print('%s文件被加密,需要密码!' % filename)
            write_result(filename, '文件需要密码')
            # No reader is returned, so nothing will use this handle again.
            old_file.close()
            return None
        # decrypt() returns 0 on failure, 1 for a user-password match and 2
        # for an owner-password match; only 0 means the password is wrong.
        if pdf_reader.decrypt(password) == 0:
            print('%s密码不正确!' % filename)
            old_file.close()
            return None
    return pdf_reader
Esempio n. 15
0
        def pypdf3():
            """Watermark every page of ``document`` and write ``output_filename``.

            The first page of ``watermark`` is merged onto each input page, or
            placed beneath it on a blank page when ``self.underneath`` is set.

            NOTE(review): the original note read "Much slower than PyPDF3
            method", which contradicts this function's name; presumably
            another backend was meant. Confirm.

            :return: ``output_filename``
            """
            # 5b. Get our files ready
            document_reader = PdfFileReader(document)
            output_file = PdfFileWriter()

            # Number of pages in input document
            page_count = document_reader.getNumPages()

            # Watermark objects
            watermark_reader = PdfFileReader(watermark)
            wtrmrk_page = watermark_reader.getPage(0)
            wtrmrk_width = (wtrmrk_page.mediaBox.getWidth() / 2) + 0
            wtrmrk_height = (wtrmrk_page.mediaBox.getHeight() / 2) + 80
            # Query the rotation once instead of constructing Info twice.
            rotation = Info(watermark_reader).rotate
            wtrmrk_rotate = -int(rotation) if rotation is not None else 0

            def stamp(target_page):
                # Rotated watermarks use the half-size offsets computed above;
                # unrotated ones are merged at the origin.
                if wtrmrk_rotate != 0:
                    target_page.mergeRotatedTranslatedPage(wtrmrk_page, wtrmrk_rotate, wtrmrk_width, wtrmrk_height)
                else:
                    target_page.mergeTranslatedPage(wtrmrk_page, 0, 0)

            # 5c. Go through all the input file pages to add a watermark to them
            for page_number in range(page_count):
                if not self.underneath:
                    # Watermark on top of the existing page content
                    input_page = document_reader.getPage(page_number)
                    stamp(input_page)
                else:
                    # Watermark first on a blank page, then the content on top
                    size = Info(document_reader).dimensions
                    input_page = PageObject().createBlankPage(document_reader, size['w'], size['h'])
                    stamp(input_page)
                    input_page.mergePage(document_reader.getPage(page_number))

                # Add page from input file to output document
                output_file.addPage(input_page)

            # 5d. finally, write "output" to PDF
            with open(output_filename, "wb") as outputStream:
                output_file.write(outputStream)
            return output_filename
Esempio n. 16
0
def overlay_pdfs(top_pdf, bottom_pdf, destination):
    """
    Merge the first page of one PDF on top of the first page of another.

    :param top_pdf: PDF whose first page is placed on top
    :param bottom_pdf: PDF whose first page is placed underneath
    :param destination: Path of the single-page PDF to write
    """
    top_reader = PdfFileReader(top_pdf)
    bottom_reader = PdfFileReader(bottom_pdf)

    # Stamp the top page onto the bottom page in place
    merged_page = bottom_reader.getPage(0)
    merged_page.mergePage(top_reader.getPage(0))

    writer = PdfFileWriter()
    writer.addPage(merged_page)

    # Persist the merged page to the destination file
    with open(destination, "wb") as out_stream:
        writer.write(out_stream)
    def start_Encryption(self):
        """
        Encrypt the selected PDF with the user and owner passwords.

        The encrypted copy is written next to the source file as
        ``<name>_encrypted<ext>``. On success the selection and password
        fields are reset and disabled. If a password field is empty, or the
        PDF cannot be processed, an error dialog is shown instead.
        """
        import os  # local import: only needed to derive the output path

        def notify(title, icon, text):
            msg = QMessageBox()
            msg.setWindowTitle(title)
            msg.setIcon(icon)
            msg.setText(text)
            msg.exec_()

        # self.filename is reset to '' after a successful run, so guard the
        # index access instead of relying on a swallowed IndexError.
        source_path = self.filename[0] if self.filename else ''
        user_password = self.userPassword.text()
        owner_password = self.ownerPassword.text()

        if not (source_path and user_password and owner_password):
            if owner_password == '':
                notify('Error', QMessageBox.Critical,
                       'Owner Password Field is Empty.')
            if user_password == '':
                notify('Error', QMessageBox.Critical,
                       'User Password Field is Empty.')
            return

        try:
            pfw = PdfFileWriter()
            pdffile = PdfFileReader(source_path)

            for page in range(pdffile.numPages):
                pfw.addPage(pdffile.getPage(page))

            pfw.encrypt(user_password, owner_password)

            # Build the output name from the real extension; a plain
            # str.replace('.pdf', ...) leaves names such as "A.PDF" unchanged
            # (overwriting the source) and rewrites every ".pdf" occurrence.
            root, extension = os.path.splitext(source_path)
            with open(root + '_encrypted' + extension, 'wb') as file:
                pfw.write(file)
        except Exception as err:
            # Report the failure instead of silently ignoring it.
            notify('Error', QMessageBox.Critical, str(err))
            return

        notify('Done', QMessageBox.Information,
               'File encryption done successfully.')

        self.filename = ''

        self.userPassword.setText('')
        self.ownerPassword.setText('')
        self.userPassword.setDisabled(True)
        self.ownerPassword.setDisabled(True)
        self.startEncryption.setDisabled(True)
Esempio n. 18
0
def pdf_metadata_load(pdf_file):
    """
    Return the document-information dictionary of a PDF.

    :param pdf_file: path of the PDF to read
    :return: the reader's ``documentInfo`` (may be None when the PDF has no
        info dictionary)
    """
    with open(pdf_file, 'rb') as fin:
        # Read the info while the stream is still open; the reader resolves
        # objects from the stream on demand. The writer and page copy that
        # used to be built here were never used.
        return PdfFileReader(fin).documentInfo
Esempio n. 19
0
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """
    Build a ``BookMeta`` for a PDF from its XMP metadata and document info.

    XMP values take precedence. Empty author/title/subject/tags are then
    filled from the document info when available. A still-empty title falls
    back to ``original_file_name`` and a still-empty author to 'Unknown'.

    :param tmp_file_path: path of the uploaded PDF
    :param original_file_name: file name used as the fallback title
    :param original_file_extension: extension stored in the result
    :return: BookMeta describing the file
    """
    doc_info = None
    xmp_info = None

    if use_pdf_meta:
        with open(tmp_file_path, 'rb') as f:
            pdf_file = PdfFileReader(f)
            doc_info = pdf_file.getDocumentInfo()
            xmp_info = parse_xmp(pdf_file)

    # Start from empty values so the fallbacks below apply uniformly.
    # Previously the no-XMP case preset the author to 'Unknown', which
    # disabled the document-info author lookup.
    author = ''
    title = ''
    subject = ""
    tags = ""
    languages = [""]
    publisher = ""

    if xmp_info:
        author = ' & '.join(split_authors(xmp_info['author']))
        title = xmp_info['title']
        subject = xmp_info['subject']
        tags = xmp_info['tags']
        languages = xmp_info['languages']
        publisher = xmp_info['publisher']

    if doc_info:
        if author == '' and doc_info.author:
            author = ' & '.join(split_authors([doc_info.author]))
        if title == '' and doc_info.title:
            title = doc_info.title
        if subject == '':
            subject = doc_info.subject or ""
        if tags == '' and '/Keywords' in doc_info:
            if isinstance(doc_info['/Keywords'], bytes):
                tags = doc_info['/Keywords'].decode('utf-8')
            else:
                tags = doc_info['/Keywords']

    # Final fallbacks; a non-empty XMP title is no longer overwritten just
    # because the document info is missing.
    if title == '':
        title = original_file_name
    if author == '':
        author = u'Unknown'

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags=tags,
        series="",
        series_id="",
        languages=','.join(languages),
        publisher=publisher,
        pubdate="",
        identifiers=[])
Esempio n. 20
0
    def pypdf3(self):
        """Rotate every page of ``self.file_name`` clockwise by ``self.rotation``.

        The rotated pages are written to ``self.outfn``, which is returned.
        """
        with open(self.file_name, 'rb') as source:
            writer = PdfFileWriter()
            reader = PdfFileReader(source)
            page_total = reader.numPages
            for index in range(page_total):
                rotated = reader.getPage(index)
                rotated.rotateClockwise(self.rotation)
                writer.addPage(rotated)

            # Write while the source is still open
            with open(self.outfn, 'wb') as target:
                writer.write(target)
        return self.outfn
Esempio n. 21
0
def merge_pdfs(paths, output):
    """Concatenate the pages of several PDFs, in order, into one file.

    :param paths: iterable of PDF inputs accepted by ``PdfFileReader``
    :param output: path of the merged PDF to write
    """
    writer = PdfFileWriter()

    for source in paths:
        reader = PdfFileReader(source)
        page_total = reader.getNumPages()
        # Append every page of this document to the writer
        for page in (reader.getPage(index) for index in range(page_total)):
            writer.addPage(page)

    # Serialise the merged document
    with open(output, 'wb') as merged_file:
        writer.write(merged_file)
Esempio n. 22
0
def has_PDFA_XMP(file):
    """
    Tell whether a PDF's XMP metadata declares an accepted PDF/A version.

    :param file: path of the PDF to inspect
    :return: True when XMP metadata exists and its PDF/A version is listed in
        ``app.config["PDFA"]["ACCEPTED_VERSIONS"]``; False otherwise,
        including when the file cannot be read or parsed
    """
    try:
        with open(file, mode="rb") as fileData:
            xmpfile = PdfFileReader(fileData, strict=False)
            metadata = xmpfile.getXmpMetadata()
            if metadata is not None:
                pdfa = app.config["PDFA"]
                nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
                if get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]:
                    return True
            return False
    # Best-effort check: any ordinary failure means "not PDF/A". Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    except Exception:
        return False
Esempio n. 23
0
    def pdfMerge(self, savePath, pdfPath, watermarkPdfPath):
        """
        Stamp the first page of a watermark PDF onto every page of a PDF.

        :param savePath: path of the PDF file to write
        :param pdfPath: path of the PDF file to watermark
        :param watermarkPdfPath: path of the PDF whose first page is the watermark
        :return: None
        """
        # Keep both inputs open until the output is written: page content is
        # read lazily from the underlying streams.
        with open(pdfPath, 'rb') as pdfFile, \
                open(watermarkPdfPath, 'rb') as watermarkPdfFile:
            pdfReader = PdfFileReader(pdfFile, strict=False)
            watermarkPdf = PdfFileReader(watermarkPdfFile, strict=False).getPage(0)

            pdfWriter = PdfFileWriter()

            for pageNum in range(pdfReader.numPages):
                pageObj = pdfReader.getPage(pageNum)

                # Offsets that centre the watermark on the page
                x = (pageObj.mediaBox[2] - watermarkPdf.mediaBox[2]) / 2
                y = (pageObj.mediaBox[3] - watermarkPdf.mediaBox[3]) / 2

                pageObj.mergeTranslatedPage(page2=watermarkPdf,
                                            tx=x,
                                            ty=y,
                                            expand=False)

                pdfWriter.addPage(pageObj)

            # The context manager guarantees the output is flushed and closed
            with open(savePath, 'wb') as resultFile:
                pdfWriter.write(resultFile)
Esempio n. 24
0
def openFiles(fileformat="", fileName=""):
    """
    Open a file according to its format and pass it to the matching processor.

    :param fileformat: one of "pdf", "xlsx", "docx" or "csv"; any other value
        does nothing
    :param fileName: path of the file to open
    :return: None
    """
    if fileformat == "pdf":
        # Keep the stream open only while the PDF is being processed.
        # NOTE(review): assumes processPDFFiles does not keep the reader for
        # later use; confirm.
        with open(fileName, 'rb') as pdf_stream:
            processPDFFiles(PdfFileReader(pdf_stream))
    elif fileformat == "xlsx":
        xlsxdoc = xlrd.open_workbook(fileName)
        processXlsxFiles(xlsxdoc)
    elif fileformat == "docx":
        docxFileHandler = docx.Document(fileName)
        processDocxFiles(docxFileHandler)
    elif fileformat == "csv":
        # NOTE(review): 'myDialect' is registered but not passed to
        # DictReader, which therefore uses the default dialect; confirm intent.
        csv.register_dialect('myDialect', delimiter=',', skipinitialspace=True)
        # newline='' is the csv module's documented way to open input files.
        with open(fileName, 'r', newline='') as csvfile:
            csvreader = csv.DictReader(csvfile)
            processCSVFiles(csvreader)
Esempio n. 25
0
 def _reader(path, password, prompt):
     """Read PDF and decrypt if encrypted.

     :param path: a PdfFileReader, or anything PdfFileReader accepts
     :param password: password for an encrypted PDF; may be empty/None
     :param prompt: when true, ask on stdin for a password if none was given
         and the empty password does not work
     :return: the PdfFileReader, or False when the PDF is encrypted, no
         password is available and prompting is disabled
     """
     pdf = PdfFileReader(path) if not isinstance(path, PdfFileReader) else path
     if pdf.isEncrypted:
         if not password:
             # isEncrypted stays true after decrypting, so success has to be
             # read from decrypt()'s return value (0 means failure).
             if pdf.decrypt(''):
                 return pdf
             if not prompt:
                 return False
             print('No password has been given for encrypted PDF ', path)
             password = input('Enter Password: ')
         pdf.decrypt(password)
     return pdf
Esempio n. 26
0
def pdf_parser(s):
    """
    Extract the document-information entries of a PDF given as bytes.

    :param s: the PDF content as bytes
    :return: dict mapping each info key without its first character to the
        stored value; empty when there is no info dictionary or decryption
        is not implemented for this file
    """
    s = s.strip()
    # Warnings go to os.devnull to suppress them. The sink stays open for all
    # work on the reader, since decrypting and reading the info may also emit
    # warnings to it.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(BytesIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo() or {}
        # Drop the first character of every key
        return {key[1:]: meta.get(key) for key in meta.keys()}
def split_pdf(myfile):
    """
    Extract the first two pages of every 50-page group and upload the result.

    For each full group of 50 pages in ``/tmp/<myfile>``, the group's first
    page (and second, when it exists) is added to the writer, which is written
    to ``/tmp/document-page<i>.pdf`` and uploaded with ``client.upload_file``
    under ``extracted-pdf/``.

    :param myfile: name of the PDF inside ``/tmp/``
    :return: None
    """
    with open('/tmp/' + myfile, 'rb') as pdf_in_file:
        inputpdf = PdfFileReader(pdf_in_file)
        pages_no = inputpdf.numPages
        print(pages_no)
        # NOTE(review): one writer is shared by all iterations, so each output
        # file also contains the pages of the previous groups; confirm whether
        # a fresh writer per file was intended.
        output = PdfFileWriter()
        for i in range(pages_no // 50):
            output.addPage(inputpdf.getPage(i * 50))
            if i * 50 + 1 < pages_no:
                output.addPage(inputpdf.getPage(i * 50 + 1))
                print('/tmp/document-page%s.pdf' % i)
            newname = 'document-page%s.pdf' % i
            print(newname)
            with open("/tmp/document-page%s.pdf" % i, "wb") as outputStream:
                output.write(outputStream)
            # Upload only after the file is closed, so buffered data is on
            # disk before it is read back.
            client.upload_file('/tmp/' + newname, destbucketName,
                               'extracted-pdf/' + newname)
Esempio n. 28
0
def inject_pdf_links(filepath: str,
                     pdf_data: bytes,
                     links: Iterable[Link],
                     size_relative: bool = True) -> None:
    """
    Injects links into a pdf file data
    :param filepath: the output file path for the pdf; missing parent
        directories are created
    :param pdf_data: the source pdf data as bytes
    :param links: an iterable of `Link` objects
    :param size_relative: if True (default) the coordinates and sizes of the links'
        bounding boxes must be relative [0~1] to the width of the pdf page,
        otherwise their absolute values are used
    """
    pdf_stream: BytesIO = BytesIO(pdf_data)
    source_pdf: PdfFileReader = PdfFileReader(pdf_stream)
    pdf_writer: PdfFileWriter = PdfFileWriter()
    pdf_writer.appendPagesFromReader(source_pdf)
    # All link rectangles are positioned against the first page's trim box
    pdf_page_trim_box: RectangleObject = source_pdf.getPage(0).trimBox
    pdf_page_box: SizedBox = SizedBox(
        x=pdf_page_trim_box[0],
        y=pdf_page_trim_box[1],
        width=pdf_page_trim_box[2] - pdf_page_trim_box[0],
        height=pdf_page_trim_box[3] - pdf_page_trim_box[1],
    )
    pdf_scale: float = pdf_page_box.width if size_relative else 1.0
    link: Link
    for link in links:
        link_box: SizedBox = SizedBox(
            x=pdf_page_box.x + link.box.x * pdf_scale,
            y=pdf_page_box.y + link.box.y * pdf_scale,
            width=link.box.width * pdf_scale,
            height=link.box.height * pdf_scale,
        )
        # pdf coord system is bottom-left, so invert y
        link_box.y = pdf_page_box.y1 - link_box.y - link_box.height
        # noinspection PyTypeChecker
        pdf_writer.addURI(
            pagenum=0,
            uri=link.uri,  # Broken type annotation in PyPDF3
            rect=[link_box.x0, link_box.y0, link_box.x1, link_box.y1],
            border=[0, 0, 0],
        )
    # A bare file name has an empty dirname, and os.makedirs('') raises.
    output_dir = os.path.dirname(filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(filepath, "wb") as out_fp:
        pdf_writer.write(out_fp)
Esempio n. 29
0
def start():
    """
    Interactively print the document-information metadata of a PDF in pdfs/.

    Lists the files in ``pdfs/`` (skipping names containing "emptyfile"),
    asks for a file name on stdin, appends ".pdf" when missing, and prints
    every metadata entry as ``- <key>:<value>``.
    """
    from PyPDF3 import PdfFileReader
    import glob
    import os
    print("Put PDF file in pdfs/")
    print("Which PDF file would you like to read the meta data for?")
    for d in glob.iglob("pdfs/*"):
        if "emptyfile" not in d:
            # str.replace needs the replacement argument; strip the prefix.
            print(d.replace("pdfs/", ""))
    ans = str(input("> "))
    if ".pdf" not in ans:
        ans = ans + ".pdf"
    # Names are listed without the "pdfs/" prefix, so resolve the answer
    # inside that directory unless it already points at an existing file.
    pdf_path = ans if os.path.exists(ans) else os.path.join("pdfs", ans)
    # PdfFileReader expects a stream (or path), not a (name, mode) tuple.
    with open(pdf_path, 'rb') as pdf_stream:
        docInfo = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        for metaItem in docInfo:
            # Values are not guaranteed to be plain strings.
            print("- " + metaItem + ":" + str(docInfo[metaItem]))
    print("\n")
def open_all_pdfs(directory, file_dict):
    """Open every listed PDF and key the readers by a bookmark name.

    The bookmark name is the first regex group matched in the file name, with
    one trailing '.' removed when present.

    :param directory: directory containing the files
    :param file_dict: mapping whose values are the PDF file names
    :return: dict of bookmark name -> PdfFileReader (non-strict); the
        underlying files are left open for the readers
    """
    # Table numbers in the file names can leave a trailing '.' on the match,
    # which is stripped below.
    bookmark_grep = r"([a-zA-Z\d.]*[ -]*[a-zA-Z& ]+[\d]*[.]?[\d]*)"
    readers_by_bookmark = {}

    for pdf_name in file_dict.values():
        pdf_path = os.path.join(directory, pdf_name)
        bookmark = re.search(bookmark_grep, pdf_name).group(1)

        # Side effect of the pattern: drop a trailing dot
        if bookmark.endswith("."):
            bookmark = bookmark[:-1]

        # Open the pdf and store its reader in the dict
        readers_by_bookmark[bookmark] = PdfFileReader(open(pdf_path, 'rb'), False)
    return readers_by_bookmark