Example #1
0
# Title-case every word of an upper-case string.
string = 'HELLO ALL'
titled = string.title()
print(titled)

# Split on runs of whitespace (the default separator).
string = 'hello Everyone'
words = string.split()
print(words)

# Split on an explicit separator.
string = 'hello-Everyone'
words = string.split(sep='-')
print(words)

# With maxsplit=1 the string is split at the first occurrence only.
string = 'www.all.com'
words = string.split(sep='.', maxsplit=1)
print(words)

# PDF Reading
import PyPDF2
# Open the PDF in binary mode ('rb').  The raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# such as "\A" or "\M"; the resulting path is unchanged.
with open(r'F:\Algorithmica\MyCodes\DAAI Newbie document.pdf', 'rb') as f:
    pdf_reader = PyPDF2.PdfFileReader(f)
    # Number of pages
    print(pdf_reader.numPages)

    Page_one = pdf_reader.getPage(0)
    print(Page_one)

    # Extract the text of the first page while the file is still open.
    page_one_text = Page_one.extractText()
    # Bare expression: it only echoes the text in an interactive session.
    page_one_text
        page['/Annots'].append(indir)
    else:
        page[NameObject('/Annots')] = ArrayObject([indir])

    if annot.popup:
        popup = _popup_annotation(indir)
        indir_popup = outpdf._addObject(popup)
        annot[NameObject('/Popup')] = indir_popup
        page['/Annots'].append(indir_popup)


# Demo entry point: adds one highlight and one text annotation to the first
# page of the PDF named on the command line.
# NOTE(review): highlight_annotation, text_annotation and add_annotation are
# expected to be defined earlier in this module; nothing visible here writes
# `outpdf` to disk — the snippet may be truncated, confirm.
if __name__ == '__main__':
    import sys
    import PyPDF2 as pyPdf
    try:
        # IndexError: no argument given; IOError: the file could not be opened.
        # NOTE(review): the file object passed to the reader is never closed.
        inpdf = pyPdf.PdfFileReader(open(sys.argv[1], 'rb'))
    except (IndexError, IOError):
        print("Needs PDF file as an argument.")
        raise SystemExit
    # First argument is a list of four-number lists, followed by three strings.
    annot1 = highlight_annotation([
        [100, 100, 400, 125]
    ], 'An argument is a connected series of statements intended to establish a proposition.',
                                  'Graham Chapman',
                                  'I came here for a good argument.')
    # First argument is a single four-number list, followed by three strings.
    annot2 = text_annotation([100, 50, 125, 75], "No it isn't.", 'John Cleese',
                             "No you didn't.")
    page = inpdf.getPage(0)
    outpdf = pyPdf.PdfFileWriter()
    # Both annotations are attached to the same page before it is added to
    # the writer.
    add_annotation(outpdf, page, annot1)
    add_annotation(outpdf, page, annot2)
    outpdf.addPage(page)
# This is a script to delete the final page in a pdf

import PyPDF2 as pdf

# Copy every page except the last one into a new document.
with open('ANNEXE3.pdf', 'rb') as pdf1File:
    pdf1Reader = pdf.PdfFileReader(pdf1File)
    pdfWriter = pdf.PdfFileWriter()

    # range(numPages - 1) yields 0 .. n-2, so only the final page is left
    # out.  (The previous range(1, numPages) dropped the first page instead.)
    for pageNum in range(pdf1Reader.numPages - 1):
        pdfWriter.addPage(pdf1Reader.getPage(pageNum))

    # Write while the input is still open, matching the original ordering
    # in which the source file was closed only after the write.
    with open('outputpdf.pdf', 'wb') as pdfOutputFile:
        pdfWriter.write(pdfOutputFile)
import PyPDF2
import sys

# Files named on the command line are processed; without arguments the two
# sample files are used.  (Previously the command-line list was read and then
# unconditionally overwritten by the sample list.)
filenamelist = sys.argv[1:] or ["etsy.pdf", "amazon.pdf"]


for filename in filenamelist:
    # Close each PDF as soon as its text has been extracted.
    with open(filename, 'rb') as pdf_stream:
        currentfile = PyPDF2.PdfFileReader(pdf_stream)
        # NOTE(review): index 2 is the third page although the variable is
        # called "firstpage" — confirm which page is intended.
        firstpage = currentfile.getPage(2).extractText()

    if firstpage:
        # The first space-separated word decides the document type.
        first_word = firstpage.split(" ")[0]
        if first_word == "Order":
            pdftype = "etsy"
        elif first_word == "Ship":
            pdftype = "amazon"
        else:
            pdftype = "error"
    # The extracted text is printed whether or not it is empty.
    print(firstpage)
    
import pyttsx3
import PyPDF2
# Read every page of the book aloud with the text-to-speech engine.
with open("An_Introduction_to_Software_Engineering_and_Fault_.pdf", "rb") as book:
    pdfReader = PyPDF2.PdfFileReader(book)
    pages = pdfReader.numPages
    # One engine is enough for the whole book; create it once, not per page.
    bot = pyttsx3.init()
    # range(pages) covers 0 .. pages-1.  (The previous range(0, pages - 1)
    # stopped one page early and never read the last page.)
    for num in range(pages):
        page = pdfReader.getPage(num)
        text = page.extractText()
        bot.say(text)
        bot.runAndWait()
Example #6
0
#! python3

import sys
import os
import PyPDF2

# Password used to encrypt every PDF; taken from the first CLI argument.
password = sys.argv[1]
# TODO: add the real path to os.walk


for folderName, subfolders, filenames in os.walk('XXXX'):
    for filename in filenames:
        if not filename.endswith('.pdf'):
            continue
        # Keep the source open until the encrypted copy has been written.
        with open(os.path.join(folderName, filename), 'rb') as source:
            pdfReader = PyPDF2.PdfFileReader(source)
            print(pdfReader)
            pdfWriter = PyPDF2.PdfFileWriter()
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))
            pdfWriter.encrypt(password)
            # Slice off the extension.  str.strip('.pdf') would remove any of
            # the characters '.', 'p', 'd', 'f' from BOTH ends of the name
            # (e.g. "pdf_draft.pdf" -> "_draft").
            stem = filename[:-len('.pdf')]
            print(stem)
            # As before, the result is written relative to the current
            # working directory, not to folderName.
            with open(stem + '_encrypted.pdf', 'wb') as resultPdf:
                pdfWriter.write(resultPdf)





Example #7
0
def make_pdf(dlg):
    """Join the page ranges listed in the dialog's table into one output PDF.

    For each table row the requested pages are copied (optionally rotated)
    into a ``PyPDF2.PdfFileWriter``.  Unless ``dlg.noToC.Value`` is set, a
    level-1 bookmark is added per input file and the entries of that file's
    table of contents that fall inside the copied range are attached below it.

    Relies on ``wx``, ``PyPDF2``, ``fitz`` and ``os`` being available at
    module level.

    Args:
        dlg: Dialog object.  The code reads ``dlg.szr02.Table.data`` (rows of
            file name, page count, from-page, to-page, rotation),
            ``dlg.btn_aus``, ``dlg.FileList``, ``dlg.noToC`` and the
            ``austit`` / ``ausaut`` / ``aussub`` value fields.

    Returns:
        The output path obtained from ``dlg.btn_aus.GetPath()``, or ``None``
        when the table holds no rows.
    """
    # no file selected: treat like "Cancel"
    if not len(dlg.szr02.Table.data):  # no files there
        return None

    # Timestamp used for both /CreationDate and /ModDate.
    # NOTE(review): the "-04'30'" offset is hard-coded in the format string —
    # confirm it is intended.
    cdate = wx.DateTime.Now().Format("D:%Y%m%d%H%M%S-04'30'")
    ausgabe = dlg.btn_aus.GetPath()
    pdf_fle_out = open(ausgabe, "wb")
    pdf_out = PyPDF2.PdfFileWriter()
    aus_nr = 0  # current page number in output
    pdf_dict = {
        "/Creator": "PDF-Joiner",
        "/Producer": "PyMuPDF, PyPDF2",
        "/CreationDate": cdate,
        "/ModDate": cdate,
        "/Title": dlg.austit.Value,
        "/Author": dlg.ausaut.Value,
        "/Subject": dlg.aussub.Value
    }
    pdf_out.addMetadata(pdf_dict)
    # Maps bookmark level -> most recently added bookmark at that level, so a
    # new bookmark can be attached to the bookmark one level above it.
    parents = {}
    #==============================================================================
    # process one input file
    #==============================================================================
    for zeile in dlg.szr02.Table.data:
        dateiname = zeile[0]
        doc = dlg.FileList[dateiname]
        max_seiten = int(zeile[1])
        #==============================================================================
        # user input minus 1, PDF pages count from zero
        # also correct any inconsistent input
        #==============================================================================
        von = int(zeile[2]) - 1
        bis = int(zeile[3]) - 1

        von = max(0, von)  # "from" must not be < 0
        bis = min(max_seiten - 1, bis)  # "to" must not be > max pages - 1
        bis = max(von, bis)  # "to" cannot be < "from"
        rot = int(zeile[4])  # get rotation angle

        pdfin = PyPDF2.PdfFileReader(dateiname)
        for p in range(von, bis + 1):  # read pages from input file
            pdf_page = pdfin.getPage(p)
            if rot > 0:
                pdf_page.rotateClockwise(rot)  # rotate the page
            pdf_out.addPage(pdf_page)  # output the page

        # title = "infile [from-to (max.pages)]"
        # NOTE(review): with "continue" here aus_nr is not advanced; it is
        # only used for bookmark targets, which this branch skips.
        if dlg.noToC.Value:  # no ToC wanted
            continue
        # dateiname[:-4] drops the last four characters (presumably ".pdf").
        # NOTE(review): under Python 3 .encode("latin-1") yields bytes, which
        # "%s" renders as b'...' — confirm the targeted Python version.
        bm_main_title = "%s [%s-%s (%s)]" % \
              (os.path.basename(dateiname[:-4]).encode("latin-1"), von + 1,
               bis + 1, max_seiten)

        bm_main = pdf_out.addBookmark(bm_main_title, aus_nr, None, None, False,
                                      False, "/Fit")
        print(1, bm_main_title, aus_nr)

        parents[1] = bm_main  # lvl 1 bookmark is infile's title

        toc = fitz.GetToC(doc)  # get infile's table of contents
        bm_lst = []  # prepare the relevant sub-ToC
        # Each ToC entry is used as (level, title, page number); the range
        # test below treats the page number as 1-based.
        for t in toc:
            if t[2] > von and t[2] <= bis + 1:  # relevant page range only
                bm_lst.append([
                    t[0] + 1,  # indent increased 1 level
                    t[1],  # the title
                    t[2] + aus_nr - von - 1
                ])  # new page number

        aus_nr += (bis - von + 1)  # increase output counter

        if bm_lst == []:  # do we have a sub-ToC?
            continue  # no, next infile
        # while indent gap is too large, prepend "filler" bookmarks to bm_lst
        while bm_lst[0][0] > 2:
            # Reuses the name of the outer loop variable; the outer "for"
            # rebinds it on its next iteration.
            zeile = [bm_lst[0][0] - 1, "<>", bm_lst[0][2]]
            bm_lst.insert(0, zeile)
        # now add infile's bookmarks
        for b in bm_lst:
            # The parent is the latest bookmark one level above this entry.
            bm = pdf_out.addBookmark(b[1].encode("latin-1"), b[2],
                                     parents[b[0] - 1], None, False, False,
                                     "/Fit")
            parents[b[0]] = bm

#==============================================================================
# all input files processed
#==============================================================================
    pdf_out.write(pdf_fle_out)
    pdf_fle_out.close()
    return ausgabe
Example #8
0
def get_pdf_title(pdf_file_path):
    """Return the title stored in a PDF's document information.

    Args:
        pdf_file_path: Path of the PDF file to inspect.

    Returns:
        The ``title`` attribute of the document information, or ``None``
        when the PDF has no document information at all.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file; it was previously left open.
    with open(pdf_file_path, "rb") as pdf_file:
        info = PyPDF2.PdfFileReader(pdf_file).getDocumentInfo()
        # getDocumentInfo() gives None when the PDF has no info dictionary.
        return info.title if info is not None else None
Example #9
0
Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
'''

import sys
import PyPDF2
from PIL import Image

if (len(sys.argv) != 2):
    print("\nUsage: python {} input_file\n".format(sys.argv[0]))
    sys.exit(1)

pdf = sys.argv[1]

if __name__ == '__main__':
    input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
    page0 = input1.getPage(30)

    if '/XObject' in page0['/Resources']:
        xObject = page0['/Resources']['/XObject'].getObject()

        for obj in xObject:
            if xObject[obj]['/Subtype'] == '/Image':
                size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
                data = xObject[obj].getData()
                if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
                    mode = "RGB"
                else:
                    mode = "P"

                if '/Filter' in xObject[obj]:
Example #10
0
def OCRPDF(
        source="",
        targetPath=None,
        page=None,
        nice=5,
        verbose=False,
        tesseract_config='--oem 1 -l best/eng -c preserve_interword_spaces=1 textonly_pdf=1',
        logger=None):
    """OCR a PDF page by page and return or save the resulting PDF.

    Args:
        source: Path of a PDF file when it is a ``str``; anything else is
            treated as the PDF content and handed to ``io.BytesIO`` and
            ``convert_from_bytes``.
        targetPath: When truthy, the result is written to this path.
        page: 1-based number of the single page to OCR, or ``None`` to OCR
            every page.
        nice: Forwarded to ``ocrPilImage``.
        verbose: When true, progress messages are emitted.
        tesseract_config: Forwarded to ``ocrPilImage`` as ``config``.
        logger: Optional object with ``info`` and ``error`` methods; ``print``
            is used when it is falsy.

    Returns:
        ``True`` after writing to ``targetPath``; the PDF as ``bytes`` when no
        ``targetPath`` was given; ``None`` if any exception occurred (the
        error is reported through ``logger.error`` or ``print``).
    """

    def _say(message):
        # Progress output: only in verbose mode, through the logger when one
        # was supplied and through print otherwise.
        if verbose:
            (logger.info if logger else print)(message)

    def _ocr_selected_pages(render_page, page_count, output):
        # OCR either every page or just `page`, appending results to `output`.
        # `render_page(n)` must return the images for the 1-based page n.
        if page is None:
            _say("\tOCRing entire document with total page count of: {0}"
                 .format(page_count))
            page_numbers = range(1, page_count + 1)
        else:
            _say("\tOCRing only page {0} of {1}".format(page, page_count))
            page_numbers = [page]

        for page_number in page_numbers:
            if page is None:
                _say("\tOCRing page {0} of {1}".format(page_number,
                                                       page_count))
            page_image_array = render_page(page_number)
            output.addPage(
                ocrPilImage(image=page_image_array[0],
                            nice=nice,
                            config=tesseract_config))

    try:
        output = PyPDF2.PdfFileWriter()

        if isinstance(source, str):
            _say("You passed a string in as source. Trying this as source pdf file path.")

            page_count = PyPDF2.PdfFileReader(source).getNumPages()
            _, file_extension = os.path.splitext(source)

            # NOTE(review): a path without the ".pdf" extension adds no pages
            # and still falls through to the success message below — confirm
            # whether that should be reported as an error instead.
            if file_extension == ".pdf":
                _say("OCRUSREX - Try extracting Images from path: {0}".format(
                    source))

                def render_from_path(page_number):
                    # File sources are rendered at 300 dpi.
                    return convert_from_path(source,
                                             dpi=300,
                                             first_page=page_number,
                                             last_page=page_number)

                _ocr_selected_pages(render_from_path, page_count, output)

                if page is not None:
                    # Goes through the same channel as every other progress
                    # message (it used to bypass the logger).
                    _say("Done")

        # If source isn't a string, assume it holds the PDF content. If
        # incorrect, the error handling below will catch this.
        else:
            _say("OCRUSREX - Try extracting Images from bytes object")
            page_count = PyPDF2.PdfFileReader(io.BytesIO(source)).getNumPages()

            def render_from_bytes(page_number):
                # In-memory sources are rendered at 100 dpi.
                return convert_from_bytes(source,
                                          dpi=100,
                                          first_page=page_number,
                                          last_page=page_number)

            _ocr_selected_pages(render_from_bytes, page_count, output)

        _say("OCRUSREX - Successfully processed!")

        # If targetPath was provided, assume it is a valid path and write.
        if targetPath:
            # The context manager closes the stream even if the write fails.
            with open(targetPath, "wb") as outputStream:
                output.write(outputStream)
            # upon success, return a truthy value
            return True

        # otherwise, return the result as a bytes object
        output_file_obj = io.BytesIO()
        output.write(output_file_obj)
        return output_file_obj.getvalue()

    except Exception as e:
        # Deliberate catch-all boundary: any failure is reported and
        # signalled to the caller as None.
        (logger.error if logger else print)("ERROR - Exception: {0}".format(e))
        return None
#               Automate the Boring Stuff with Python 3
#               Ch 13 -  Working with PDF and Word Documents
# Rotating Pages
'''
The pages of a PDF can also be rotated in 90-degree increments with the rotateClockwise() and
rotateCounterClockwise() methods.
'''

import PyPDF2
# Rotate the first page of the minutes by 90 degrees clockwise and save it as
# a one-page PDF.  Context managers close both files even if a step fails.
with open('meetingminutes.pdf', 'rb') as minutesFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    page = pdfReader.getPage(0)
    page.rotateClockwise(90)
    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(page)
    # Write while the source is still open, as in the original ordering.
    with open('rotatedPage.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)
Example #12
0
def ocrPilImage(image=None, nice=5, config=""):
    """OCR one image with pytesseract and return it as a PDF page.

    The image is converted to PDF data by
    ``pytesseract.image_to_pdf_or_hocr`` (``nice`` and ``config`` are
    forwarded unchanged); that data is parsed with PyPDF2 and its first page
    is returned.
    """
    pdf_bytes = pytesseract.image_to_pdf_or_hocr(image,
                                                 extension='pdf',
                                                 nice=nice,
                                                 config=config)
    single_page_reader = PyPDF2.PdfFileReader(io.BytesIO(pdf_bytes))
    return single_page_reader.getPage(0)
Example #13
0
    partInfo = requests.get(
        'https://web-booktab.zanichelli.it/api/v1/resources_web/' + isbn +
        '/' + part.getAttribute("btbid") + '/config.xml',
        headers={'Cookie': cookie})

    #print('http://web.booktab.it/boooks_web/'+isbn+'/'+part.getAttribute("btbid")+'/config.xml')

    if partInfo.status_code != 200:
        continue

    partXML = parseString(partInfo.text)

    key = partXML.getElementsByTagName("content")[0].firstChild.nodeValue

    pdfUrl = ''

    for entry in partXML.getElementsByTagName("entry"):
        if entry.getAttribute("key") == key + ".pdf":
            pdfUrl = entry.firstChild.nodeValue + ".pdf"
            break

    pdf = requests.get(
        'https://web-booktab.zanichelli.it/api/v1/resources_web/' + isbn +
        '/' + part.getAttribute("btbid") + '/' + pdfUrl,
        headers={'Cookie': cookie})

    merger.append(PyPDF2.PdfFileReader(ResponseStream(pdf.iter_content(64))))

merger.write(input("Input a title for the file: ") + ".pdf")
Example #14
0
def _collapse_whitespace(text):
    """Replace every run of whitespace in *text* by one space and trim it."""
    return re.sub(r'\s+', ' ', text).strip()


def _read_docx_text(path):
    """Return the text of a .docx file."""
    return docx2txt.process(path)


def _read_pdf_text(path):
    """Return the text of the first page of a PDF file."""
    with open(path, 'rb') as pdfFileObj:
        return PyPDF2.PdfFileReader(pdfFileObj).getPage(0).extractText()


def _read_txt_text(path):
    """Return the content of a text file (platform default encoding)."""
    with open(path, 'r') as handle:
        return handle.read()


def _read_xlsx_text(path):
    """Return all cells of the sheet read by pandas, joined into one string."""
    xlsdf = pd.read_excel(path)
    # Concatenate the stringified cells of each row, then join the rows.
    xlsdf['new'] = xlsdf.astype(str).values.sum(axis=1)
    return ' '.join(xlsdf['new'].tolist())


def _read_ppt_text(path):
    """Return the text of every run in every text frame of a presentation."""
    prs = Presentation(path)
    text_runs = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    text_runs.append(run.text)
    # No prs.close(): the presentation object offers no such method, so the
    # old call always raised and was silently swallowed.
    return ' '.join(text_runs)


# File extension -> function returning the raw text of such a file.
_TEXT_READERS = {
    '.docx': _read_docx_text,
    '.pdf': _read_pdf_text,
    '.txt': _read_txt_text,
    '.xlsx': _read_xlsx_text,
    '.ppt': _read_ppt_text,
}


def read_files(pathorfile, indir='C:/Users/Faiz Ali/Rezide/imp'):
    """Extract whitespace-normalised text from a set of documents.

    Args:
        pathorfile: Either a directory path (``str``) — the process changes
            into it and reads every entry — or an iterable of file paths.
        indir: Directory the process changes into after reading (where the
            remaining files and the ML model are stored).  Defaults to the
            previously hard-coded location.

    Returns:
        A tuple ``(allfilesDF, filename)``: a DataFrame with one ``data``
        column holding the de-duplicated, non-empty texts, and the list of
        files with a supported extension (including files whose extraction
        failed).  Because of the de-duplication the two are not aligned
        row by row.
    """
    if isinstance(pathorfile, str):
        os.chdir(pathorfile)
        filelist = glob.glob("*")
    else:
        filelist = pathorfile

    # reading files(.docx, .txt, .xlsx, .ppt, .pdf)
    dfList = []
    filename = []
    for file in filelist:
        try:
            extension = os.path.splitext(file)[1]
            reader = _TEXT_READERS.get(extension)
            if reader is None:
                print("WRONG EXTENSION:")
                print(file)
                continue
            filename.append(file)
            print(file)
            dfList.append(_collapse_whitespace(reader(file)))
        except Exception as exc:
            # Best effort: an unreadable file is reported and skipped rather
            # than silently ignored or allowed to abort the whole run.
            print("Could not read {0}: {1}".format(file, exc))

    os.chdir(indir)
    # Naming the column at construction also works when nothing was read.
    allfilesDF = pd.DataFrame(dfList, columns=['data'])
    allfilesDF.drop_duplicates(keep="first", inplace=True)
    allfilesDF.replace('', np.nan, inplace=True)
    allfilesDF.dropna(inplace=True)
    allfilesDF.reset_index(drop=True, inplace=True)
    return allfilesDF, filename
Example #15
0
def main():
    """Show the menu and run the task the user selects.

    The menu is displayed again until a valid option has been carried out
    (or 0 is chosen); non-numeric input and unknown numbers print an error
    and prompt again.

    Options:
        1: convert a PDF to images, OCR each image, rename it after the text
           found and convert it back to PDF.
        2: split a PDF into a new folder via ``pdf_splitter``.
        3: e-mail each contact the attachment found for them, through Gmail.
        4: split a PDF via ``pdf_splitter2``.
        0: exit.

    Relies on helpers and settings defined elsewhere in the module (``clear``,
    ``imprimir``, ``openFile``, ``newCarpeta``, ``path2``, ``ext``,
    ``pathCartas``, ``contacts``, ``msgTemplate``, ``MY_ADDRESS``, ...).
    """
    clear()
    while True:
        imprimir()
        try:
            option = int(input("Seleccione una opcion: "))

            if option == 1:  # DIPLOMAS
                pdfName = openFile()
                # New folder named after the file being processed
                pathFile = newCarpeta(pdfName, path2)

                fileList = convert_image(pdfName, pathFile, ext)
                os.chdir(pathFile)  # Go to the directory
                for image_name in fileList:
                    allTextfromImage = ocr_core(image_name)  # All text
                    subText = read_image(allTextfromImage)  # Only the needed text
                    # Rename the image file after the text
                    imageFile = rename_file(image_name, subText)
                    convert_pdf(imageFile, subText)  # convert to PDF again
                    os.unlink(imageFile)  # remove the intermediate image

                print('\nTotal de archivo renombrados y comprimidos: ' +
                      str(len(fileList)))
                break
            elif option == 2:  # LETTERS
                pdfName = openFile()
                # The context manager closes the PDF once it has been split.
                with open(pdfName, 'rb') as pdfFileObj:
                    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                    # New folder named after the file being processed
                    pathFile = newCarpeta(pdfName, path2)
                    pdf_splitter(pdfName, pdfReader, pathFile)
                    print('\nFueron Procesadas: ' + str(pdfReader.numPages) +
                          ' páginas')
                break
            elif option == 3:  # SEND LETTERS VIA GMAIL
                clear()
                # Gmail application password, typed in at run time.
                PASSWORD = input('Digite la clave de aplicaciones para Gmail: ')

                os.chdir(pathCartas)
                names, emails = get_contacts(contacts)  # read contacts
                message_template = read_template(msgTemplate)
                pathdirectory = getDirectory()  # directory holding the letters
                attach = search_files(names, pathdirectory)

                # The context manager ends the SMTP session and closes the
                # connection even if sending fails part-way.
                with smtplib.SMTP(host='smtp.gmail.com', port=587) as s:
                    s.starttls()
                    s.login(MY_ADDRESS, PASSWORD)

                    for index, (name, email) in enumerate(zip(names, emails)):
                        # Assumes `attach` holds one entry per contact (None
                        # when no file was found) — confirm against
                        # search_files.  Indexing by contact position keeps
                        # later contacts from being skipped after the first
                        # missing attachment.
                        filenameAtach = attach[index]
                        if filenameAtach is None:
                            # No attachment: no message is sent.
                            continue

                        msg = MIMEMultipart()  # create a message
                        # Insert the person's name into the template
                        message = message_template.substitute(
                            PERSON_NAME=name.title())

                        # Set up the parameters of the message
                        msg['From'] = MY_ADDRESS
                        msg['To'] = email
                        msg['Subject'] = "Exámenes psicofísicos (SIMETRIC)"
                        msg.attach(MIMEText(message, 'plain'))  # message body

                        # Attach the file as application/octet-stream; mail
                        # clients can usually download this automatically.
                        with open(filenameAtach, "rb") as attachment:
                            part = MIMEBase("application", "octet-stream")
                            part.set_payload(attachment.read())
                        # Encode the file in ASCII characters for transport
                        encoders.encode_base64(part)
                        part.add_header('Content-Disposition',
                                        'attachment',
                                        filename=filenameAtach)
                        msg.attach(part)
                        sendmailStatus = s.send_message(msg)

                        # A non-empty status is reported as a problem,
                        # showing the status itself (previously the bound
                        # method s.send_message was printed).
                        if sendmailStatus != {}:
                            print(
                                '\nThere was a problem sending the email to %s: %s'
                                % (email, sendmailStatus))
                        else:
                            print(
                                '\nThe email to %s was sent correctly with the attached:\n%s'
                                % (email, filenameAtach))
                break
            elif option == 4:  # SPLIT PDF
                pdfName = openFile()
                with open(pdfName, 'rb') as pdfFileObj:
                    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                    pdf_splitter2(pdfName, pdfReader)
                    print('\nEl documento tiene: ' + str(pdfReader.numPages) +
                          ' páginas')
                break
            elif option == 0:  # EXIT
                break
            else:
                print()
                clear()
                print('Error, solo de aceptan numeros del 0 al 4')

        except ValueError:
            print("Error, ingrese solamente numeros")
Example #16
0
# tries to access all the pdfs in folder to make sure they
# were encrypted

import PyPDF2, os, sys
try:
    path, key = sys.argv[1:]
    os.chdir(path)

    # Searching for PDFs:
    for filename in os.listdir('.'):
        if filename.endswith('.pdf'):
            try:
                # Creating PDF reader and writer object:
                print('Encrypting %s...' % (filename))
                pdfFile = open(filename, 'rb')
                pdfReader = PyPDF2.PdfFileReader(pdfFile)
                encrypted = PyPDF2.PdfFileWriter()

                # Copying content:
                for pageNum in range(pdfReader.numPages):
                    page = pdfReader.getPage(pageNum)
                    encrypted.addPage(page)

                # Encrypting:
                encrypted.encrypt(key)

                # Creating new PDF:
                encryptedFile = open(filename[:-4] + '_encrypted.pdf', 'wb')
                encrypted.write(encryptedFile)
                encryptedFile.close()
                pdfFile.close()
import PyPDF2

# creating an object
# Open the exam PDF in binary mode; the context manager closes it when done
# (it was previously left open).
with open(
        '/home/jario/Documentos/ENEM - 1998 à 2017/(1998) ENEM/1998 - ENEM - Prova amarela.pdf',
        'rb') as file:
    # creating a pdf reader object
    fileReader = PyPDF2.PdfFileReader(file)

    # print the number of pages in pdf file
    print(fileReader.numPages)

    # print the text of the first page
    print(fileReader.getPage(0).extractText())
Example #18
0
import PyPDF2
import json
import re

# Extract the text of the first page; the context manager closes the PDF
# (it was previously left open).
with open('data.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    page = read_pdf.getPage(0)
    page_content = page.extractText()

# NOTE(review): this binds the re.compile function itself, not a compiled
# pattern, and the name is not used below — looks unfinished, confirm.
mcq = re.compile

# Store the page text as a JSON string.
data = json.dumps(page_content)
with open("sample.json", "w") as file:
    file.write(data)
Example #19
0
filePath = r"C:\Users\alene\Desktop\Random PDFs"
passwort = 'chamberlain'
for folder, subfolders, files in os.walk(filePath):
    logging.debug("Current folder: " + folder)

    if subfolders != []:
        logging.debug("This folder's subfolders are: ")
        for folderName in subfolders:
            logging.debug(folderName)
    if files != []:
        logging.debug("This folder's files: ")
        for fileName in files:
            logging.debug(fileName)
            if fileName.endswith('_encrypted.pdf'):
                encryptedPdf = open(folder + "\\" + fileName, 'rb')
                if PyPDF2.PdfFileReader(encryptedPdf).isEncrypted:
                    if PyPDF2.PdfFileReader(encryptedPdf).decrypt(
                            passwort) == 1:
                        print("Deleting original unencrypted file: " +
                              fileName[:-14] + ".pdf")
                        # send2trash.send2trash(folder + "\\" + fileName[:-14] + ".pdf")
                continue
            if fileName.endswith('.pdf'):
                print("ENCRYPTING: " + fileName)
                pdfFile = open(folder + '\\' + fileName, 'rb')
                pdfRead = PyPDF2.PdfFileReader(pdfFile)
                pdfWrite = PyPDF2.PdfFileWriter()
                for pageNum in range(pdfRead.numPages):
                    pdfWrite.addPage(pdfRead.getPage(pageNum))
                pdfWrite.encrypt(passwort)
                resultPDF = open(
import PyPDF2
# Read the first page of a PDF as text.
f = open(
    'C:\\Users\\plape\\OneDrive\\Escritorio\\Python\\Python-Bootcamp-notas-ejercicios\\15working with PDF & Spreadsheet CSV\\Working_Business_Proposal.pdf',
    'rb')  # rb means read binary
pdf_reader = PyPDF2.PdfFileReader(f)  # so that it reads the PDF
pdf_reader.numPages  # when evaluated this should give the number of pages, if it works
page_one = pdf_reader.getPage(0)  # first page
page_one_text = page_one.extractText()  # this returns it as a Python string
page_one_text
# Sometimes the pages can be fetched but the extracted text comes back as empty strings; if so, the PDF file is not compatible with PyPDF2.
f.close()

#adding to pdf files
# Copy the first page of the proposal into a brand-new PDF.  Context managers
# close both files even if a step fails.
with open(
        'C:\\Users\\plape\\OneDrive\\Escritorio\\Python\\Python-Bootcamp-notas-ejercicios\\15working with PDF & Spreadsheet CSV\\Working_Business_Proposal.pdf',
        'rb') as f:
    pdf_reader = PyPDF2.PdfFileReader(f)
    first_page = pdf_reader.getPage(0)
    pdf_writer = PyPDF2.PdfFileWriter()
    # addPage expects a PDF page object, not a Python string, for example:
    type(first_page)  # this is just an example
    pdf_writer.addPage(first_page)
    # 'wb' overwrites any existing PDF file with this name.
    with open(
            'C:\\Users\\plape\\OneDrive\\Escritorio\\Python\\Python-Bootcamp-notas-ejercicios\\15working with PDF & Spreadsheet CSV\\Some_BrandNew_Doc.pdf',
            'wb') as pdf_output:
        pdf_writer.write(pdf_output)

#quiero all text dentro de un pdf file
f = open(
import PyPDF2, os

# Password used to decrypt every encrypted PDF found below the current
# directory.
passwd = input('please input decrypted password:')
for folderName, subfolders, filenames in os.walk('.'):
    # Visit every PDF file in this folder
    for filename in filenames:
        if not filename.endswith('.pdf'):
            continue
        filedirname = folderName + '/' + filename
        # Keep the source open until the decrypted copy has been written.
        with open(filedirname, 'rb') as source:
            pdfReader = PyPDF2.PdfFileReader(source)

            if not pdfReader.isEncrypted:  # only encrypted PDFs are handled
                continue

            try:
                pdfReader.decrypt(passwd)  # decrypt the PDF
                numPages = pdfReader.numPages  # read the page count

            except PyPDF2.utils.PdfReadError:
                # Failing to read means the decryption failed
                print(filename + ", decrypted password is not correct.")
            else:
                # Decryption worked: copy the pages into a new file
                pdfWriter = PyPDF2.PdfFileWriter()
                for pageNum in range(numPages):
                    pdfWriter.addPage(pdfReader.getPage(pageNum))
                # Drop "_encrypted.pdf" when present, otherwise only ".pdf";
                # always cutting 14 characters mangled other file names.
                if filedirname.endswith('_encrypted.pdf'):
                    suffix = '_encrypted.pdf'
                else:
                    suffix = '.pdf'
                filename_decrypt = (filedirname[:-len(suffix)] +
                                    '_decrypted' + '.pdf')
                with open(filename_decrypt, 'wb') as resultPdf:
                    pdfWriter.write(resultPdf)
Example #22
0
pip install pdf2image
conda install -c conda-forge poppler
(or apt-get install -y poppler-utils) 
"""

from pdf2image import convert_from_path
import pickle
import PyPDF2 as pdf
from nltk.tokenize import RegexpTokenizer
from nltk import Text

# Input PDF: scan every page for the token `keyword` followed by the token '2'.
pdf_name = "C:/Users/jacqu/Documents/DDEFI/Projet WeFinn/Bilan d_entreprises/Airbus Annual Report 2019.pdf"

file = open(pdf_name, 'rb')
pdf_reader = pdf.PdfFileReader(file)
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters

keyword = 'co'
page = []  # indices of pages where `keyword` is directly followed by '2'
sol = []  # not used in the visible part of this snippet

for i in range(pdf_reader.getNumPages()):
    raw = pdf_reader.getPage(i).extractText()
    token = tokenizer.tokenize(raw)
    text = Text(token)
    word = [w.lower() for w in text]  # lower-cased tokens of page i
    # NOTE(review): nothing visible below removes the match from `word`, so as
    # shown this loop cannot terminate once `keyword` is found, and
    # `word[index + 1]` raises IndexError when the match is the last token.
    # The snippet appears truncated -- confirm against the full script.
    while keyword in word:
        index = word.index(keyword)
        if word[index + 1] == '2':
            page.append(i)
Example #23
0
#imports
import PyPDF2

try:
    # Adds a watermark to every page of test.pdf.
    # Both inputs stay open until the result is written, because PyPDF2 reads
    # page content from the source streams when the writer serialises them.
    with open('test.pdf', 'rb') as sourceFile, \
            open('watermark.pdf', 'rb') as watermarkFile:
        fileToWatermark = PyPDF2.PdfFileReader(sourceFile)
        watermark = PyPDF2.PdfFileReader(watermarkFile)

        outFile = PyPDF2.PdfFileWriter()

        for x in range(fileToWatermark.getNumPages()):

            page = fileToWatermark.getPage(x)

            # mergePage combines the content of two pages into one
            page.mergePage(watermark.getPage(0))  # watermark only has 1 page

            outFile.addPage(page)

        # The result has to be opened for binary *writing*; the previous 'rb'
        # mode either failed with FileNotFoundError or gave the writer a
        # read-only handle.
        with open('outFile', 'wb') as output:
            outFile.write(output)

except FileNotFoundError:
    print("The file was not found")
Example #24
0
import PyPDF2

try:
    # Stamp the first page of wtr.pdf onto every page of the cover document
    # and save the result as newfile.pdf.
    output = PyPDF2.PdfFileWriter()
    # Open the inputs in a `with` block so they are closed afterwards; they
    # must stay open until the writer has finished, because page content is
    # read from the source streams at write time.
    with open("./pdf/wtr.pdf", "rb") as wtr_file, \
            open("./pdf/00. Cover v0.2.pdf", "rb") as cover_file:
        wtr_reader = PyPDF2.PdfFileReader(wtr_file)
        i_reader = PyPDF2.PdfFileReader(cover_file)
        for i in range(i_reader.getNumPages()):
            page = i_reader.getPage(i)
            page.mergePage(wtr_reader.getPage(0))
            output.addPage(page)

        with open("./pdf/newfile.pdf", "wb") as f:
            output.write(f)


except FileNotFoundError:
    print("File not existed")

Example #25
0
def getPdfReaderObj(file_marker):
    """Return a PyPDF2.PdfFileReader built from the result of opnFile(file_marker)."""
    return PyPDF2.PdfFileReader(opnFile(file_marker))
Example #26
0
import PyPDF2

# Check whether the sample document is already password protected.
pdf = PyPDF2.PdfFileReader('data/src/pdf/sample1.pdf')
print(pdf.isEncrypted)
# False

# Clone the source document into a writer so it can be saved with a password.
src_pdf = PyPDF2.PdfFileReader('data/src/pdf/sample1.pdf')
dst_pdf = PyPDF2.PdfFileWriter()
dst_pdf.cloneReaderDocumentRoot(src_pdf)

# Printed as-is, the document info shows IndirectObject references.
print(src_pdf.documentInfo)
# {'/Title': IndirectObject(33, 0), '/Producer': IndirectObject(34, 0), '/Creator': IndirectObject(35, 0), '/CreationDate': IndirectObject(36, 0), '/ModDate': IndirectObject(36, 0)}

# Passing the info object straight to addMetadata does not work:
# dst_pdf.addMetadata(src_pdf.documentInfo)
# TypeError: createStringObject should have str or unicode arg

# Looking each key up individually yields the plain string values
# (see the printed output below), which addMetadata accepts.
d = dict(
    (key, src_pdf.documentInfo[key]) for key in src_pdf.documentInfo.keys()
)
print(d)
# {'/Title': 'sample1', '/Producer': 'macOS バージョン10.14.2(ビルド18C54) Quartz PDFContext', '/Creator': 'Keynote', '/CreationDate': "D:20190114072947Z00'00'", '/ModDate': "D:20190114072947Z00'00'"}

dst_pdf.addMetadata(d)
dst_pdf.encrypt('password')

# Save the password-protected copy.
with open('data/temp/sample1_pass.pdf', 'wb') as f:
    dst_pdf.write(f)
Example #27
0
def process_pdf(pdf_file_path, g, show_page_lists=False):
    """Split a PDF of two-page records into one PDF and one index file per record.

    Pages are consumed in ascending order. Every even-numbered page (0-based)
    is searched for a WID; every odd-numbered page closes the current record,
    at which point the accumulated pages are written to ``<seq>001.pdf`` and a
    one-line index to ``<seq>IDX.dat``. Both files go into
    ``jttocust100001_<timestamp>/0/<NN>/`` relative to the working directory,
    where ``<seq>`` is the record counter zero-padded to five digits and
    ``<NN>`` is ``seq // 100000`` zero-padded to two digits.

    Args:
        pdf_file_path: Path of the PDF to split.
        g: Object providing the ``appid``, ``scan_date`` and ``year``
            attributes that are written into each index line.
        show_page_lists: When true, only print the computed page lists and
            stop the program via ``SystemExit``; no output is created.

    Raises:
        SystemExit: If ``show_page_lists`` is true.
    """
    print("Processing: {0}".format(os.path.basename(pdf_file_path)))

    # Raw string so the \d escapes reach the regex engine untouched.
    wid_re = re.compile(r"\d{3}(AD)\d{4}|(W)\d{8}")
    date_string = datetime.datetime.today().strftime("%m%d%Y%H%M%S")

    # Results go under <run folder>/0 (the primary folder).
    save_dir_name = os.path.join(
        'jttocust100001_{timestamp}'.format(timestamp=date_string), '0')

    # make a new directory to save results in (not needed for the debug run)
    if not show_page_lists and not os.path.exists(save_dir_name):
        os.makedirs(save_dir_name)

    # The `with` block closes the source PDF on every exit path, including
    # exceptions and the debug SystemExit below.
    with open(pdf_file_path, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        page_count = pdfReader.numPages

        # make lists of all pages, pages to search WID on, last page of each
        # record. This could be derived arithmetically because each record is
        # 2 pages, but explicit collections are kept as a framework for
        # projects with more than two pages per record.
        all_pages = set(range(page_count))
        wid_search_pages = {i for i in all_pages if i % 2 == 0}
        doc_last_pages = {i for i in all_pages if i % 2 == 1}

        # a little condition for debugging
        if show_page_lists:
            print("** Debug page lists, full processing not done **")
            print("all pages: ", all_pages)
            print("name pages: ", wid_search_pages)
            print("last pages: ", doc_last_pages)
            raise SystemExit

        batch = PyPDF2.PdfFileWriter()
        extracted_wid = None
        seq = 0
        # sorted() makes the page order explicit instead of relying on the
        # iteration order of a set.
        for i in sorted(all_pages):
            # i is the source pdf page number
            pageObj = pdfReader.getPage(i)
            batch.addPage(pageObj)

            # Create secondary folder (one per 100000 records)
            secondary_dir = os.path.join(save_dir_name,
                                         str(seq // 100000).zfill(2))
            if not os.path.exists(secondary_dir):
                os.mkdir(secondary_dir)

            if i in wid_search_pages:
                # search for text, save to variable
                text = pageObj.extractText()
                srch = wid_re.search(text)
                srch_cnt = wid_re.findall(text)

                if len(srch_cnt) > 2:
                    print(("WARNING!!! Too Many Matches!!!: "
                           "{0} Record: {1}\n{2}\n\n".format(
                               os.path.basename(pdf_file_path), i, text)))

                if srch is not None:
                    extracted_wid = srch[0]
                else:
                    # NOTE(review): only a message is printed; the record is
                    # still written below with the previously extracted WID
                    # (or None) -- confirm this is intended.
                    print("Skipping: {0} Record: {1}\n{2}\n\n".format(
                        os.path.basename(pdf_file_path), i, text))

            # The former extra check `i != pdfReader.numPages` was always
            # true (i never exceeds numPages - 1), so it has been dropped.
            if i in doc_last_pages:
                # write dat file, write out to pdf
                with open(
                        os.path.join(secondary_dir,
                                     "{0:0>5}001.pdf".format(seq)),
                        'wb') as output:
                    batch.write(output)
                with open(
                        os.path.join(secondary_dir,
                                     "{0:0>5}IDX.dat".format(seq)),
                        'w') as datfile:
                    datfile.write(
                        "{appid};1;;;;;;;;;;;{wid};0001;N;{year};{scan}\n".format(
                            wid=extracted_wid,
                            appid=g.appid,
                            scan=g.scan_date,
                            year=g.year))
                seq += 1
                # start collecting the next record
                batch = PyPDF2.PdfFileWriter()
            external TEXT
        )''')
        cur.execute('''REPLACE INTO ''' + '"' + str(roll_no) + '"' +''' (sub, internal, external) VALUES (?, ?, ?)''',(int(key),subs[key][0],subs[key][1]))
        conn.commit()
# Collect every entry of `statusr` into a list.
# NOTE(review): presumably `statusr` is an open status file whose lines name
# PDFs that were already handled (the check below appends '\n') -- confirm.
slist = list()
for sl in statusr:
    slist.append(sl)

# Go through every file in ./res_pdf and read its pages one by one.
for files in os.listdir('./res_pdf'):
    
    # NOTE(review): at module level this `global` statement has no effect.
    global pdfpath 
    pdfpath= os.path.join('res_pdf', files)
    
    # Skip files whose path (plus a trailing newline) is already in slist.
    if (pdfpath + '\n') in slist: continue

    pdfres = PyPDF2.PdfFileReader(open(pdfpath, 'rb'))
    pg = pdfres.getNumPages()
    global    j
    j = 0  # index of the page currently being read
    count = 0
    while j < pg:
        print('PDF NAME', pdfpath, j)
        # NOTE(review): a failing getPage only prints "Done!"; execution then
        # continues with `res` from the previous iteration (or unbound on the
        # first one). The bare except also hides unrelated errors.
        try:
            res = pdfres.getPage(j)
        except:
            print("Done!")
        j = j + 1
        txt = res.extractText()
        txt = txt.strip()
        #print(txt)
        line = txt.split('\n')  # page text split into individual lines
Example #29
0
# search_string = input('Enter author name for search: ')
# search_results = dblp.search(search_string)

# if(search_results.empty):
#     print("No results...\nExtting...")
#     exit(1)

# for index, value in search_results.iterrows():
#     print(f"{index}): {value.Title}\n")

# option = int(input('Enter index of the record to get data: '))
# record = search_results.loc[option]

# request = requests.get(record.Link)

# html_soup = BeautifulSoup(request.text, 'html.parser')
# download_link = html_soup.find('a', class_='c-pdf-download__link')
# get_pdf = download_link.get('href')
# request = requests.get("https://" + get_pdf[2:])

# with open("".join("PDF.pdf"), "wb" ) as pdf_file:
#     pdf_file.write(request.content)

# Dump the extracted text of every page of PDF.pdf into sample.txt, writing
# ' \n' after each page. The `with` block closes the PDF once all pages have
# been read (it was previously left open).
with open('PDF.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    numOfPages = pdfReader.numPages

    with open("sample.txt", "w") as txt_file:
        for i in range(numOfPages):
            txt_file.write(pdfReader.getPage(i).extractText())
            txt_file.write(' \n')
 # Fetch a news article (HTML) and a paper (PDF) and collect the text of each.
import io

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2 fallback

from bs4 import BeautifulSoup

# Read the URL, keep the markup in html1 and parse it into soup.
url1 = "https://www.theguardian.com/politics/2018/sep/20/the-death-of-consensus-how-conflict-came-back-to-politics"
html1 = urlopen(url1).read().decode('utf8')
soup = BeautifulSoup(html1, 'lxml')

# Read the PDF and save its text in pdfString.
url2 = "http://eprints.lse.ac.uk/86880/7/Cox_Rise%20of%20populism%20published_2018.pdf"
# open() only works on local paths, so download the bytes and give PyPDF2 a
# seekable in-memory stream instead.
pdf2 = io.BytesIO(urlopen(url2).read())
fileReader = PyPDF2.PdfFileReader(pdf2)

# Concatenate the text of the first 11 pages (fewer if the document is shorter).
pdfString = "".join(
    fileReader.getPage(x).extractText()
    for x in range(min(11, fileReader.numPages)))

# Printing the text from url2 is disabled; uncomment to see it.
# print(pdfString)

# Concatenate the text of the title and of every paragraph of the article.
text = "".join(element.text for element in soup.find_all(['title', 'p']))

# At this point, there are text and pdfString.

# Printing the text from url1 is disabled; uncomment to see it.
# print(text)