コード例 #1
0
def combina_foi_din_diferite_pdf():
    """Merge the PDFs of the current directory into 'rezultat_merging.pdf'.

    Input files are every ``*.pdf`` in the current working directory, taken
    in case-insensitive alphabetical order.  The output file of a previous
    run is excluded so that it is not merged into itself.

    For each input, pages are copied starting at index 1, i.e. the first
    page of every document is skipped (kept from the original loop bounds).

    Returns:
        None.  The merged document is written to 'rezultat_merging.pdf'.
    """
    output_name = 'rezultat_merging.pdf'

    # 1. Find all PDFs in the folder.
    pdfFiles = [
        filename for filename in os.listdir('.')
        if filename.endswith('.pdf') and filename != output_name
    ]
    pdfFiles.sort(key=str.lower)
    print(pdfFiles)

    pdfWriter = PyPDF3.PdfFileWriter()
    opened_files = []
    try:
        # 2. Open each PDF and 3. add its pages while *its own* reader is
        # in scope (previously only the last reader was ever used).
        for filename in pdfFiles:
            pdfFileObj = open(filename, 'rb')
            opened_files.append(pdfFileObj)
            pdfReader = PyPDF3.PdfFileReader(pdfFileObj)
            for pageNum in range(1, pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

        # 4. Save the result once.  The inputs stay open until here because
        # the writer pulls page data from the source streams while writing.
        with open(output_name, 'wb') as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for pdfFileObj in opened_files:
            pdfFileObj.close()
コード例 #2
0
def decrypt():
    """Interactively decrypt a PDF and save an unencrypted copy.

    Prompts for a file name and a password.  The copy is written to the
    current working directory as ``'decrypted' + <original base name>``.

    The process exits (``sys.exit()``) when the given path does not exist,
    does not end in ``.pdf``, or when the password is rejected.
    """
    pdfName = input('Enter the name of PDF file to decrypt:\n')
    pdfName = path.abspath(pdfName)

    # Check if entered filename is valid
    if not path.exists(pdfName) or pdfName[-4:].lower() != '.pdf':
        print(f'The filename {pdfName} is not a PDF.')
        sys.exit()

    outputName = 'decrypted' + path.basename(pdfName)

    with open(pdfName, 'rb') as pdfFile:
        pdfReader = PyPDF3.PdfFileReader(pdfFile)

        # Password for decryption
        password = input('Enter the password to decrypt the PDF: \n')
        # decrypt() returns 0 when the password matches neither the user
        # nor the owner password; stop before page access fails.
        if not pdfReader.decrypt(password):
            print('The password is incorrect.')
            sys.exit()

        # Loop through the pages and add them to pdfWriter
        pdfWriter = PyPDF3.PdfFileWriter()
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        # Save the resulting PDF while the source file is still open
        with open(outputName, 'wb') as decryptedPdf:
            pdfWriter.write(decryptedPdf)

    print(f'File decrypted and saved as decrypted{path.basename(pdfName)}')
コード例 #3
0
    def add_barcode(self, barcode, original_path, x_pos=None, y_pos=None):
        """
        Adds a barcode to the first page of a document and saves a copy.

        The barcode is centred horizontally on the first page.  The copy is
        written to ``{self.save_path}/{stem}_{barcode}_coded.pdf`` where
        ``stem`` is the file name of ``original_path`` without directory and
        extension.

        :param barcode: Barcode to add to the file specified in original_path
        :param original_path: Path to the original file.
        :param x_pos: Accepted for interface compatibility; currently unused
            (the barcode is always centred horizontally).
        :param y_pos: Position of the bottom of the barcode measure in mm from the bottom of the page.
            When None the barcode is placed at y = 0.
        :return: None
        """
        import os  # local import: only needed for the file-name handling below

        merger = PyPDF3.PdfFileReader(original_path)
        barcode_doc = PyPDF3.PdfFileReader(
            self.barcode_doc.generate_barcode(barcode)).getPage(0)
        first_page = merger.getPage(0)

        if y_pos is not None:
            # get_y_pos is defined elsewhere; presumably converts mm to page
            # units using the page height -- NOTE(review): confirm.
            barcode_target_y = self.get_y_pos(first_page.mediaBox[3], y_pos)
        else:
            barcode_target_y = 0

        # Horizontal offset that centres the barcode on the page.
        centre_pos = (float(first_page.mediaBox[2]) / 2
                      - float(barcode_doc.mediaBox[2]) / 2)
        first_page.mergeTranslatedPage(barcode_doc, centre_pos,
                                       barcode_target_y)

        out = PyPDF3.PdfFileWriter()
        out.addPage(first_page)
        for page in merger.pages[1:]:
            out.addPage(page)

        # basename/splitext also work for paths without a directory part,
        # where the previous rfind('/') slicing produced an empty name.
        stem = os.path.splitext(os.path.basename(original_path))[0]
        output_filename = f"{self.save_path}/{stem}_{barcode}_coded.pdf"
        with open(output_filename, 'wb') as output:
            out.write(output)
コード例 #4
0
def encripteaza(pdf_file, password='swordfish',
                output_file='rezultat_encriptare.pdf'):
    """Write an encrypted copy of a PDF.

    Args:
        pdf_file: Path of the PDF to read.
        password: Password used to encrypt the copy.  Defaults to the
            previously hard-coded value.
        output_file: Path of the encrypted copy.  Defaults to the previously
            hard-coded file name.

    Returns:
        None.
    """
    with open(pdf_file, 'rb') as pdfFile:
        pdfReader = PyPDF3.PdfFileReader(pdfFile)
        pdfWriter = PyPDF3.PdfFileWriter()
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))
        pdfWriter.encrypt(password)  # password the copy is encrypted with
        # Written while the source is still open; both handles are closed
        # even if writing fails.
        with open(output_file, 'wb') as resultPdf:
            pdfWriter.write(resultPdf)
コード例 #5
0
def roteste_pagina(pdf_file):
    """Rotate the first page of a PDF by 90 degrees clockwise.

    The rotated page is saved on its own as 'rezultat_rotire.pdf'.

    Args:
        pdf_file: Path of the PDF whose first page is rotated.

    Returns:
        None.
    """
    # Context managers close both files even when a PDF call raises.
    with open(pdf_file, 'rb') as minutesFile:
        pdfReader = PyPDF3.PdfFileReader(minutesFile)
        page = pdfReader.getPage(0)
        page.rotateClockwise(90)
        pdfWriter = PyPDF3.PdfFileWriter()
        pdfWriter.addPage(page)
        with open('rezultat_rotire.pdf', 'wb') as resultPdfFile:
            pdfWriter.write(resultPdfFile)
コード例 #6
0
def overlay(pdf_file):
    """Stamp the first page of 'watermark.pdf' onto the first page of a PDF.

    All other pages are copied unchanged.  The result is written to
    'rezultat_overlay.pdf'.

    Args:
        pdf_file: Path of the PDF to watermark.

    Returns:
        None.
    """
    with open(pdf_file, 'rb') as minutesFile, \
            open('watermark.pdf', 'rb') as watermarkFile:
        pdfReader = PyPDF3.PdfFileReader(minutesFile)
        # Same library as the other reader and the writer (was PyPDF2).
        pdfWatermarkReader = PyPDF3.PdfFileReader(watermarkFile)

        minutesFirstPage = pdfReader.getPage(0)
        minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))

        pdfWriter = PyPDF3.PdfFileWriter()
        pdfWriter.addPage(minutesFirstPage)
        for pageNum in range(1, pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        with open('rezultat_overlay.pdf', 'wb') as resultPdfFile:
            pdfWriter.write(resultPdfFile)
コード例 #7
0
def decripteaza():
    """Decrypt 'encrypted.pdf' with a fixed password and print diagnostics.

    Prints, in order: whether the document is encrypted, the return value of
    ``decrypt('rosebud')``, and the first page object.
    """
    with open('encrypted.pdf', 'rb') as pdfFile:
        pdfReader = PyPDF3.PdfFileReader(pdfFile)
        print(pdfReader.isEncrypted)
        print(pdfReader.decrypt('rosebud'))
        # print() has no 'pageObj' keyword; bind the page first, then print.
        pageObj = pdfReader.getPage(0)
        print(pageObj)
コード例 #8
0
ファイル: PDFSearch.py プロジェクト: alexbarnhill/PDFSearch
def search(path, term, flags):
    """Recursively search the PDFs below a directory for a regex.

    Every match is reported through ``print_found`` with the file path and
    the zero-based page index.

    Args:
        path: Directory to search; sub-directories are searched recursively.
        term: Regular expression passed to ``re.search``.
        flags: ``re`` flags passed to ``re.search``.

    Returns:
        None.
    """
    # Check every item in the directory
    for item in listdir(path):
        full_path = join(path, item)

        # If the item is another directory, enter it and keep searching
        if isdir(full_path):
            search(full_path, term, flags)
            continue
        if not isfile(full_path):
            continue

        # We only care about PDFs
        filename, file_extension = splitext(item)
        if file_extension != ".pdf":
            continue

        print_searching("Searching in: " + filename + " ... ")
        # The context manager closes the file even if a page cannot be read.
        with open(full_path, 'rb') as pdf_file_object:
            pdf_reader = PyPDF3.PdfFileReader(pdf_file_object)

            # Go through each page in the pdf
            for page_number in range(0, pdf_reader.getNumPages()):
                text = pdf_reader.getPage(page_number).extractText()
                # If the term exists on the page, print the file and page
                if re.search(term, text, flags=flags):
                    print_found("\tFOUND: " + full_path +
                                " Page: " + str(page_number))
コード例 #9
0
def parse_pdf(doc_name):
    """
    Split the text of a PDF document into clauses.

    Text fragments (separated by " \\n" in the extracted page text) are
    joined with single spaces until a fragment ends with a full stop and the
    accumulated clause is longer than 30 characters; the clause is then
    emitted.  Text still pending after the last page is not returned.

    Parameters
    ----------
    doc_name: string
        filepath of the Adobe PDF document with pdf extension

    Returns
    -------
    list of string
        the clauses found, in document order
    """
    reader = PyPDF3.PdfFileReader(doc_name)
    total_pages = reader.getNumPages()

    clauses = []
    pending = ""
    for page_index in range(total_pages):
        page_text = reader.getPage(page_index).extractText()
        for fragment in page_text.split(" \n"):
            # Trim the fragment and drop the line breaks inside it.
            cleaned = fragment.strip().replace('\n', '')
            if not cleaned:
                continue
            pending = " ".join([pending, cleaned]).strip()
            if cleaned.endswith(".") and len(pending) > 30:
                clauses.append(pending)
                pending = ""
    return clauses
コード例 #10
0
    def _import_pdf_fields(self):
        """
        Load the form fields of the original PDF and of its copy.

        The result is stored in ``self.pdf_fields`` under the keys ``'o'``
        (from ``self.pdf_file_path``) and ``'c'`` (from
        ``self.pdf_copy_file_path``).

        Parameters: None
        Returns:    None
        """
        self.pdf_fields = {}
        # Original first, then the copy -- same order as the two paths are
        # read from the instance.
        sources = (('o', 'pdf_file_path'), ('c', 'pdf_copy_file_path'))
        for key, attribute in sources:
            pdf = getattr(self, attribute)
            self.pdf_fields[key] = PyPDF3.PdfFileReader(pdf).getFields()
コード例 #11
0
    def _get_fields(self):
        """
        Read the form fields of ``self.pdf_file_path`` into ``self._fields``.

        Parameters: None
        Returns:    None
        """
        reader = PyPDF3.PdfFileReader(self.pdf_file_path)
        # getFields() result is stored as-is.
        self._fields = reader.getFields()
コード例 #12
0
def extrage_text(pdf_file):
    """Print the page count and the text of the first page of a PDF.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        None.
    """
    with open(pdf_file, 'rb') as pdfFileObj:
        pdfReader = PyPDF3.PdfFileReader(pdfFileObj)
        pageObj = pdfReader.getPage(0)
        print(pdfReader.numPages)
        # Text is extracted once; the earlier discarded call was dropped.
        print(pageObj.extractText())
コード例 #13
0
def append_pdf():
    """Append 'meetingminutes2.pdf' to 'meetingminutes.pdf'.

    All pages of the first document followed by all pages of the second are
    written to 'combinedminutes1.pdf'.
    """
    first_file = open('meetingminutes.pdf', 'rb')
    second_file = open('meetingminutes2.pdf', 'rb')
    first_reader = PyPDF3.PdfFileReader(first_file)
    second_reader = PyPDF3.PdfFileReader(second_file)

    writer = PyPDF3.PdfFileWriter()
    # Same copy loop for both inputs, first document first.
    for reader in (first_reader, second_reader):
        for index in range(reader.numPages):
            writer.addPage(reader.getPage(index))

    combined = open('combinedminutes1.pdf', 'wb')
    writer.write(combined)
    combined.close()
    first_file.close()
    second_file.close()
コード例 #14
0
 def extract_data_from_pdf(self,pdf_file):
     """Return the extracted text of every page of a PDF, concatenated.

     :param pdf_file: path of the PDF to read
     :return: the page texts joined in page order, without separators
     """
     # The file is closed as soon as the text has been collected.
     with open(pdf_file, 'rb') as pdfFileObj:
         pdfReader = PyPDF3.PdfFileReader(pdfFileObj)
         number_of_pages = pdfReader.getNumPages()
         return "".join(
             pdfReader.getPage(page_number).extractText()
             for page_number in range(number_of_pages))
コード例 #15
0
def simple():
    """Print basic facts about 'meetingminutes.pdf'.

    Prints whether the document is encrypted, its page count, and the text
    of the page at index 1 (the second page).
    """
    with open('meetingminutes.pdf', 'rb') as pdf_file:
        pdf_reader = PyPDF3.PdfFileReader(pdf_file)

        print(pdf_reader.isEncrypted)
        print(pdf_reader.numPages)

        page = pdf_reader.getPage(1)
        print(page.extractText())
def extractContent(content=""):
    """Print the text of page index 100 of every PDF in 'Resource/'.

    For each ``Resource/*.pdf`` the text of the page at index 100 is
    appended to a running list, and the list is printed after each file.
    Encrypted documents are first re-saved without encryption through
    pikepdf into an in-memory buffer; the files on disk are not modified.

    Args:
        content: Unused; kept for interface compatibility.

    Returns:
        None.

    NOTE(review): the page index 100 is hard-coded, so documents with fewer
    than 101 pages raise from ``getPage`` -- confirm this is intended.
    """
    import io  # local import: only needed for the in-memory decrypted copy

    pdfFileText = []
    for fileName in glob.glob("Resource/*.pdf"):
        pdfFileReader = pyPDF.PdfFileReader(fileName)
        if pdfFileReader.isEncrypted:
            # Previously the file was opened with pikepdf but then re-read
            # from disk unchanged, so the reader was still encrypted.
            decrypted = io.BytesIO()
            with pikepdf.open(fileName) as pdfFile:
                pdfFile.save(decrypted)
            decrypted.seek(0)
            print("%s decrypted!" % fileName)
            pdfFileReader = pyPDF.PdfFileReader(decrypted)
        pdfText = pdfFileReader.getPage(100).extractText()
        pdfFileText.append(pdfText)
        print(pdfFileText)
コード例 #17
0
ファイル: detail.py プロジェクト: hiiragiizawa/printer-rpi
    def pdf2page_count(self, path):
        """Return the number of pages of the PDF at ``path``.

        If the file cannot be opened or parsed, a warning is shown,
        ``self._go_back`` is scheduled after 3 seconds and 1 is returned.

        :param path: path of the PDF file
        :return: the page count, or 1 when the file could not be processed
        """
        try:
            # The count is read while the file is open, and the handle is
            # closed afterwards; a failure while counting pages now takes
            # the same warning path as a failure to open.
            with open(path, 'rb') as pdfFileObj:
                pdfReader = PyPDF3.PdfFileReader(pdfFileObj, strict=False)
                return pdfReader.numPages
        except Exception:
            self._show_warning(
                'Unable to process the file,\nplease contact our customer support.'
            )
            Clock.schedule_once(self._go_back, 3)
            return 1
コード例 #18
0
ファイル: mergepdf.py プロジェクト: nariba/util-scripts
def main():
    """Merge every ``.pdf`` of a directory into './merger.pdf'.

    The directory is read from standard input.  Files are merged in sorted
    name order; the listing is printed before merging.
    """
    directory = input()
    # Sorted so that the merge order does not depend on the file system.
    filelist = sorted(os.listdir(directory))
    print(filelist)

    merger = PyPDF3.PdfFileMerger()
    for name in filelist:
        _, ext = os.path.splitext(name)
        if ext == '.pdf':
            # listdir() yields bare names; prefix the directory so the file
            # is found when it is not the current working directory.
            merger.append(os.path.join(directory, name))

    merger.write('./merger.pdf')
    merger.close()
コード例 #19
0
ファイル: common.py プロジェクト: janluke/iccas-code
def find_table_page(pdf_path) -> Tuple[PageObject, int]:
    """Locate the page whose text matches ``TABLE_CAPTION_PATTERN``.

    Returns the page object together with its 1-based page number.  Raises
    ``TableExtractionError`` when no page matches.
    """
    reader = PyPDF3.PdfFileReader(str(pdf_path))
    page_total = reader.getNumPages()

    # The first page is skipped: the table is certainly not there.
    for index in range(1, page_total):
        candidate = reader.getPage(index)
        flattened = candidate.extractText().replace('\n', '')
        if TABLE_CAPTION_PATTERN.search(flattened):
            return candidate, index + 1  # 1-based page number

    raise TableExtractionError('could not find the table in the pdf')
コード例 #20
0
ファイル: DataSheet.py プロジェクト: yueyub/py_pdf_stm
 def __init__(self, datasheet_path):
     """Open a datasheet PDF and build its outline/table structures.

     :param datasheet_path: path of the datasheet PDF; wrapped in ``Path``
     """
     self.path = Path(datasheet_path)
     # The file is opened twice, once per library; neither handle is closed
     # in this method.
     self.pdf_file = PyPDF3.PdfFileReader(self.path.open('rb'))
     self.plumber = pdfplumber.load(self.path.open('rb'))
     # Starts empty; presumably filled by flatten_outline() below --
     # NOTE(review): confirm against that method.
     self.raw_outline = []
     self.tables, self.figures = {}, {}  # type: Dict
     # Node tree: a 'ROOT' node with a 'TABLES' node appended to it.
     self.table_of_content = DataSheetNode('ROOT', [0])
     self.table_root = DataSheetNode('TABLES', [-1])
     self.table_of_content.append(self.table_root)
     self.fallback_table: DataSheetTableNode = None
     # Helpers defined elsewhere in the class; they run in this order after
     # all attributes above have been set.
     self.flatten_outline()
     self.sort_raw_outline()
     self.collect_tables()
コード例 #21
0
ファイル: glopan.py プロジェクト: mortenengen/glowing-pancake
def split_pdf(pdffile: str):
    """Split a PDF file in one file per page.

    Page ``n`` (zero-based) is written next to the input as
    ``<name without .pdf>_p_<n>.pdf``.  The case of the input path is
    preserved.  If the input name does not end in ``.pdf`` (any case), the
    whole name is used as the prefix.

    Args:
        pdffile (str): The name of the PDF file to split.

    Returns:
        list: The names of the files written, in page order.
    """
    import os  # local import: only needed to strip the extension

    # splitext never fails, unlike the former lower().index('.pdf') lookup
    # which left the prefix undefined for names without "pdf".
    file_first_name, extension = os.path.splitext(pdffile)
    if extension.lower() != '.pdf':
        file_first_name = pdffile

    pages = []
    with open(pdffile, 'rb') as pdffile_handle:
        pdf_in = pypdf.PdfFileReader(pdffile_handle)
        for page in range(pdf_in.numPages):
            pages.append(file_first_name + f'_p_{page}' + '.pdf')
            pdf_out = pypdf.PdfFileWriter()
            pdf_out.addPage(pdf_in.getPage(page))

            with open(pages[-1], 'wb') as stream:
                pdf_out.write(stream)

    return pages
コード例 #22
0
ファイル: glopan.py プロジェクト: mortenengen/glowing-pancake
def combine_pdfs(pdffiles: t.List[str], outfile: str):
    """Concatenate PDF files into a single document.

    The inputs are appended in the order given and the result is written to
    ``outfile``.

    Args:
        pdffiles (list): The names of the PDF files to combine.
        outfile (str): The name of the PDF file to write.
    """
    merger = pypdf.PdfFileMerger()
    for source in pdffiles:
        merger.append(source)

    # Write the combined document, then release the merger.
    merger.write(outfile)
    merger.close()
コード例 #23
0
def simple2():
    """Concatenate 'meetingminutes.pdf' and 'meetingminutes2.pdf'.

    All pages of the first document followed by all pages of the second are
    written to 'combinedminutes.pdf'.
    """
    with open('meetingminutes.pdf', 'rb') as pdf_file, \
            open('meetingminutes2.pdf', 'rb') as pdf_file2:
        pdf_reader = PyPDF3.PdfFileReader(pdf_file)
        # Reads the second file (previously wrapped pdf_file again, which
        # duplicated the first document in the output).
        pdf_reader2 = PyPDF3.PdfFileReader(pdf_file2)

        pdf_writer = PyPDF3.PdfFileWriter()
        for reader in (pdf_reader, pdf_reader2):
            for page_num in range(reader.numPages):
                pdf_writer.addPage(reader.getPage(page_num))

        with open('combinedminutes.pdf', 'wb') as pdf_file3:
            pdf_writer.write(pdf_file3)
コード例 #24
0
    def find(self):
        """Copy every PDF under ``self.rootdir`` that contains ``self.word``.

        When ``self.overwrite`` is truthy, files directly inside
        ``self.rootdir`` whose names contain spaces are first renamed with
        underscores.  A fresh results directory named after ``self.word`` is
        then (re)created and every file whose extracted text contains the
        word is copied into it.

        :return: list of the paths of all files that could be read as PDF
        """
        if self.overwrite:
            for name in listdir(self.rootdir):
                source = join(self.rootdir, name)
                if isfile(source) and " " in name:
                    # Rename inside rootdir; bare names would be resolved
                    # against the current working directory instead.
                    os.rename(source,
                              join(self.rootdir, name.replace(" ", "_")))

        # NOTE(review): `root` is not defined in this method; presumably a
        # module-level name (or meant to be self.rootdir) -- confirm.
        newdir = os.path.join(root, self.word)

        if os.path.exists(newdir):
            shutil.rmtree(newdir)

        os.makedirs(newdir)
        results_dir = os.path.abspath(newdir)
        fil = []

        for subdir, dirs, files in os.walk(self.rootdir):
            # Do not search the copies this method has just produced.
            if os.path.abspath(subdir) == results_dir:
                continue

            for file in files:
                pdf = os.path.join(subdir, file)
                try:
                    with open(pdf, 'rb') as pdfFileObj:
                        pdfReader = PyPDF3.PdfFileReader(pdfFileObj)
                        if pdfReader.isEncrypted:
                            pdfReader.decrypt('')

                        page_count = pdfReader.getNumPages()
                        fil.append(pdf)

                        for i in range(page_count):
                            text = pdfReader.getPage(i).extractText()
                            if self.word in text:
                                # Portable copy without a shell; replaces
                                # the Windows-only "copy" command string.
                                shutil.copy(pdf, os.path.join(newdir, file))
                                break
                except Exception:
                    # Best effort: unreadable files are reported and
                    # skipped.  Ctrl-C is no longer swallowed.
                    print(pdf + "not decryptable")

        return fil
コード例 #25
0
ファイル: TestClass.py プロジェクト: GuyDahan1/testproject
 def croptime(self, Path):
     """Attach the PDF at ``Path`` to this instance for cropping.

     Keeps prompting for a new path until the file can be read.  On success
     sets ``self.tPdf`` (the reader), ``self.path`` and ``self.is_cropped``,
     and writes a line to 'LOG.txt'.

     :param Path: path of the PDF to attach
     """
     while True:
         try:
             # The second positional parameter of PdfFileReader is `strict`,
             # not a file mode; the former "rb" was merely truthy, which
             # equals the default.
             TestPdf = PyPDF3.PdfFileReader(Path)
             break
         except Exception:
             # `except Exception` keeps Ctrl-C / SystemExit working.
             print("File not found")
             Path = input("Enter new path\n")
     self.tPdf = TestPdf
     self.path = Path
     self.is_cropped = True
     logging.basicConfig(format='%(asctime)s - %(message)s',
                         level=logging.INFO,
                         filename='LOG.txt')  # Writing to log file
     logging.info('Test pdf attached')  # Writing to log file
コード例 #26
0
ファイル: TestClass.py プロジェクト: GuyDahan1/testproject
 def __crop__(self, name):
     """Crop one page of the attached PDF and save it as its own file.

     The page number and the crop-box corners are read interactively.  The
     cropped page is written to ``Class\\pdfFileHere\\<name>.pdf``.

     :param name: base name (without extension) of the output file
     """
     pagNum = int(input("Enter page number\n"))
     upperX = int(input("please enter upper x cordinate\n"))
     upperY = int(input("please enter upper y cordinate\n"))
     lowerX = int(input("please enter lower x cordinate\n"))
     lowerY = int(input("please enter lower y cordinate\n"))
     # NOTE(review): the result of getPath() was never used; the call is
     # kept only in case it has side effects -- confirm and remove.
     self.getPath()
     writer = PyPDF3.PdfFileWriter()
     page = self.tPdf.getPage(pagNum)
     page.cropBox.setLowerLeft((lowerX, lowerY))
     page.cropBox.setUpperRight((upperX, upperY))
     writer.addPage(page)
     # Raw string: same value as before, without the invalid "\p" and "\{"
     # escape sequences that newer Python versions warn about.
     path = r"Class\pdfFileHere\{0}.pdf".format(name)
     with open(path, 'wb') as outstream:
         writer.write(outstream)
コード例 #27
0
def get_grade_from_pdf_file(pdf_file):
    """
    Return a grade extracted from the PDF file, such as "10.0" or "9.5".

    Only the annotations of the first page are inspected; the first
    annotation whose contents satisfy ``is_grade`` is returned.

    :param pdf_file: file_path : string
    :return: grade : string, or "" when no annotation holds a grade
    """
    # The context manager closes the file on every return path.
    with open(pdf_file, "rb") as handle:
        input1 = PyPDF3.PdfFileReader(handle, strict=False)
        page0 = input1.getPage(0)
        if '/Annots' not in page0:
            return ""
        for annot in page0['/Annots']:
            annot_obj = annot.getObject()
            if '/Contents' in annot_obj:
                content = str(annot_obj['/Contents'])
                if is_grade(content):
                    return content
    return ""
コード例 #28
0
    def download_workbook_pdf(self,
                              workbook: WorkbookItem,
                              dest_dir,
                              data_filters: dict = None,
                              page_type=None,
                              orientation=None):
        """
        Export every view of a workbook to PDF and merge them into one file.

        :param workbook: workbook whose views are populated and exported
        :param dest_dir: directory of the merged ``<workbook.name>.pdf``;
            per-view downloads are requested under its ``views`` sub-folder
        :param data_filters: optional mapping of filter name to value; both
            are URL-quoted before being added to the request options
        :param page_type: passed to ``PDFRequestOptions``
        :param orientation: passed to ``PDFRequestOptions``
        :return: path of the merged PDF
        :raises Exception: when the workbook has no views to export
        """
        self.server.workbooks.populate_views(workbook)

        merger = PyPDF3.PdfFileMerger()
        target_pdf = os.path.join(dest_dir, workbook.name) + ".pdf"
        request_options = PDFRequestOptions(page_type=page_type,
                                            orientation=orientation)

        filters = dict() if data_filters is None else data_filters
        for filter_name, filter_value in filters.items():
            request_options.vf(name=quote_plus(filter_name),
                               value=quote_plus(filter_value))

        log.info(
            "Exporting\nWorbook='%s' \nProject='%s' \nPage Type='%s' \nOrientation='%s' \nFilters='%s'\nFile='%s' "
            % (workbook.name, workbook.project_name, page_type, orientation,
               request_options.view_filters, target_pdf))

        views_dir = os.path.join(dest_dir, 'views')
        appended_any = False
        for view in workbook.views:
            view_pdf = self._download_view_pdf(view,
                                               dest_dir=views_dir,
                                               view_filters=request_options)
            merger.append(view_pdf)
            appended_any = True

        # Nothing was appended: fail before writing an empty document.
        if not appended_any:
            raise Exception("No Pdf Content Generated")

        merger.write(target_pdf)
        merger.close()
        log.info("Exported Workbook to pdf %s" % target_pdf)
        return target_pdf
コード例 #29
0
ファイル: search.py プロジェクト: harshitsengar/pdf-searcher
def search(query):
    """Search all known PDFs for a regex and hand the hits to ``write``.

    Each hit is recorded as ``("PDF_NAME: <file name>", "Page: <n>")`` with
    a 1-based page number; matching ignores case.  Files that raise
    ``PdfReadError`` are skipped.  Nothing happens when ``query`` is None.
    """
    if query is None:
        return

    results = []
    for pdf in pdfFiles:
        label = f"PDF_NAME: {pdf.split('/')[-1]}"
        try:
            reader = PyPDF3.PdfFileReader(pdf)
            for index in range(reader.getNumPages()):
                page_text = reader.getPage(index).extractText()
                if re.search(query, page_text, re.IGNORECASE):
                    results.append((label, f"Page: {index + 1}"))
        except PyPDF3.utils.PdfReadError:
            pass
    write(query, results)
コード例 #30
0
    def pdf_search():
        """Count and print how often ``pdfWord`` occurs in ``pdfPath``.

        ``pdfPath`` and ``pdfWord`` come from the enclosing scope.
        Occurrences are counted per page without overlap; an empty search
        word counts as zero occurrences.
        """
        count = 0

        with open(pdfPath, 'rb') as pdf_file:
            redpdf = PyPDF3.PdfFileReader(pdf_file)
            for sida in range(0, redpdf.numPages):
                text = redpdf.getPage(sida).extractText()
                # str.count matches the former find-and-advance loop, which
                # never terminated for an empty search word.
                if pdfWord:
                    count += text.count(pdfWord)

        print(pdfWord, "finns", count, "gånger i dokumentet")