Example #1
0
def extracting_text_from_pdfs(pdf_file):
    """Print the text extracted from the first page of a PDF file.

    :param pdf_file: path of the PDF file to read
    """
    import PyPDF2

    # The context manager closes the file even when parsing or text
    # extraction raises.
    with open(pdf_file, mode="rb") as pdf_file_object:
        # PdfFileReader is the reader class of the PyPDF2 1.x API;
        # ``PyPDF2.reader`` is not a callable reader.
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_object)

        page_object = pdf_reader.getPage(0)
        print(page_object.extractText())
Example #2
0
import PyPDF2

# Inspect the first page of 'ejemplo.pdf' and copy it into 'out.pdf'.
# Both files are handled by context managers so they are closed even when
# PyPDF2 raises while parsing or writing.
with open('ejemplo.pdf', 'rb') as file:
    pdfObj = PyPDF2.PdfFileReader(file)
    num_pages = pdfObj.numPages
    page1 = pdfObj.getPage(0)
    r = pdfObj.isEncrypted
    print(r)

    # getWidth is a method: it has to be called to obtain the width.
    width = page1.cropBox.getWidth()
    d = page1.extractText()

    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(page1)
    # The page content is read lazily from `file`, so the output is written
    # while the input is still open.
    with open('out.pdf', 'wb') as out:
        pdfWriter.write(out)
Example #3
0
#import packages
import pyttsx3
import PyPDF2

# Read a PDF aloud with the pyttsx3 speech engine, starting at page index 8.
# The context manager closes the PDF once reading is over (or on error).
with open('Introduction_to_Machine_Learning.pdf', 'rb') as book:
    pdfReader = PyPDF2.PdfFileReader(book)
    pages = pdfReader.numPages  # number of pages in the PDF
    speaker = pyttsx3.init()  # create the speech engine

    for num in range(8, pages):  # pages from index 8 to the end
        page = pdfReader.getPage(num)
        text = page.extractText()
        speaker.say(text)

        # Speak the queued page before moving on to the next one.
        speaker.runAndWait()
Example #4
0
def pdf_processor(pdf):
    """Print the text extracted from the first page of ``pdf``.

    :param pdf: value handed unchanged to ``PyPDF2.PdfFileReader``
    """
    first_page = PyPDF2.PdfFileReader(pdf).getPage(0)
    print(first_page.extractText())
Example #5
0
# Collect the full path of every entry of `path` whose name ends in ".pdf".
# NOTE(review): `path` and `numOfPDFsFound` are not defined in this fragment;
# presumably they are initialised earlier in the file - confirm.
fileList = []
for filename in os.listdir(path):
    if filename.endswith(".pdf"):
        numOfPDFsFound += 1
        filename = os.path.join(path, filename)
        fileList.append(filename)

# NOTE(review): `filecount` is never incremented in the visible code, so the
# progress line below always reports file 1 - confirm against the full file.
filecount = 1
foundFilesList = []
for filename in fileList:
    # "\r" rewrites the same console line for each file.
    sys.stdout.flush()
    sys.stdout.write("\rChecking file:%s out of Total %s PDFs" %
                     (filecount, numOfPDFsFound))
    sys.stdout.flush()
    # NOTE(review): this handle is not closed anywhere in the visible code.
    pdfFileObj = open(filename, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False)
    if pdfReader.isEncrypted:
        # print "The file is encrypted. Skipping"
        continue

    # Accumulates the text of all pages of the current file.
    text = ""
    count = 0

    # NOTE(review): the handler body of the `except:` below is missing from
    # the visible source (the fragment is cut off).
    try:
        num_pages = pdfReader.numPages
        #The while loop will read each page
        while count < num_pages:
            pageObj = pdfReader.getPage(count)
            count += 1
            text += pageObj.extractText()
    except:
def getPageCount(file):
    """Return the number of pages of the PDF stored at path ``file``.

    :param file: path of the PDF file to open
    :return: the ``numPages`` value reported by the PDF reader
    """
    # The page count is read while the file is still open; the context
    # manager then closes the handle instead of leaking it.
    with open(file, 'rb') as pdfFileObj:
        pdfReader = pdf.PdfFileReader(pdfFileObj)
        return pdfReader.numPages
Example #7
0
import PyPDF2
# Read the text of the first page of 'sample.pdf'.
# The context manager closes the file even if parsing or extraction fails.
with open('sample.pdf', 'rb') as file:
    reader = PyPDF2.PdfFileReader(file)
    pdfData = reader.getPage(0)
    # Extract once and reuse the result for both printing and searching.
    data = pdfData.extractText()
    print(data)

# looking for text
assert "Mechanics1" in data, 'Not present'
print('it is present')
Example #8
0
def stub_to_print(src_stub_file_path, dst_print_file_path, track_selector,
                  orchestra):
    """
    Builds the pdf to print from a stub pdf, repeating the pages of each
    track as many times as the track selector asks for. A '.log' file
    written next to the print file lists the copies made and the tracks
    that were not printed.

    :param Path src_stub_file_path: the stub pdf to read the pages from
    :param Path dst_print_file_path: the pdf to create (its parent directory is created if needed)
    :param ITrackSelector track_selector: the mechanism that computes the number of copies to do for each track
    :param Orchestra orchestra: used to build the Track objects and to read the stub's table of contents
    """
    stub_toc = get_stub_tracks(src_stub_file_path, orchestra)
    print(stub_toc)

    track_to_print_count = track_selector.get_track_to_copy(
        stub_toc.get_track_ids())
    print(track_to_print_count)
    dst_print_file_path.parent.mkdir(parents=True, exist_ok=True)

    with open(dst_print_file_path, 'wb') as print_file, open(
            dst_print_file_path.with_suffix('.log'), 'wt') as log_file:
        print_pdf = PyPDF2.PdfFileWriter()
        log_file.write("contents of print file %s :\n\n" % dst_print_file_path)
        with open(src_stub_file_path, 'rb') as stub_file:
            stub_pdf = PyPDF2.PdfFileReader(stub_file)

            ordered_tracks = sorted(
                Track(track_id, orchestra)
                for track_id in track_to_print_count.keys())

            # page ranges in order of first appearance, with the number of
            # copies and the track ids attached to each of them
            ranges = []
            range_to_num_copies = {}
            range_to_tracks = {}
            for track in ordered_tracks:
                num_copies = track_to_print_count[track.id]
                if not num_copies > 0:
                    continue

                first_page_index = stub_toc.get_tracks_first_page_index(
                    [track])
                last_page_index = stub_toc.get_tracks_last_page_index(
                    [track], stub_pdf.getNumPages())
                print('adding %d copies of %s (pages %d-%d)' %
                      (num_copies, track.id, first_page_index,
                       last_page_index))
                assert first_page_index <= last_page_index
                assert last_page_index <= stub_pdf.getNumPages()

                page_range = (first_page_index, last_page_index)
                if page_range not in ranges:
                    # first track seen on these pages
                    ranges.append(page_range)
                    range_to_num_copies[page_range] = num_copies
                    range_to_tracks[page_range] = [track.id]
                    continue

                # several tracks share these pages
                already_requested = range_to_num_copies[page_range]
                if track.instrument.get_player() == 'percussionist':
                    # percussion tracks share a single sheet : print as
                    # many copies as the most demanding track needs
                    merged_copies = max(already_requested, num_copies)
                else:
                    # other tracks sharing a page (eg bassoon 1,2) each
                    # need their own copies, so the requests add up
                    merged_copies = already_requested + num_copies
                range_to_num_copies[page_range] = merged_copies
                range_to_tracks[page_range].append(track.id)

            for page_range in ranges:
                first_page_index, last_page_index = page_range
                num_copies = range_to_num_copies[page_range]
                track_labels = '/'.join(range_to_tracks[page_range])
                log_file.write("%d copies of %s\n" %
                               (num_copies, track_labels))
                for _ in range(num_copies):
                    # the toc page numbers are 1-based, getPage is 0-based
                    for page_index in range(first_page_index,
                                            last_page_index + 1):
                        print_pdf.addPage(stub_pdf.getPage(page_index - 1))

            log_file.write("\nunprinted tracks :\n\n")
            for label in stub_toc.get_track_ids():
                label_is_printed = any(
                    label in tracks for tracks in range_to_tracks.values())
                if not label_is_printed:
                    log_file.write("no copies of %s\n" % label)
            print_pdf.write(print_file)
Example #9
0
def monster_manual_lookup(manual_file_path, monster):
    """
    function that opens and reads monster manual,
    allowing lookup of creature entries

    The text of the page found is printed for parsing purposes.

    :param manual_file_path: file path for the monster
                             manual

    :param monster: monster that we are looking up in the
                    manual

    :return page_number: index of the page describing the monster in
                         the Monster Manual (as accepted by the pdf
                         reader's getPage), or None when the monster
                         is not found in the pdf outlines
    """
    # Prepare the monster string to match format in the monster manual:
    # upper-case the first character of every space-separated word and
    # leave the rest untouched. Slicing (word[:1]) rather than indexing
    # keeps empty words - produced by an empty name or by repeated,
    # leading or trailing spaces - from raising IndexError.
    monster = " ".join(word[:1].upper() + word[1:]
                       for word in monster.split(' '))

    # Open monster manual pdf:
    with open(manual_file_path, 'rb') as pdf:
        # Create a pdf reader object
        pdf_reader = PyPDF2.PdfFileReader(pdf)

        # Retrieve pdf outlines
        outlines = pdf_reader.outlines

        # Find indices that allow us to extract text about
        # chosen monster from monster manual
        index_list, _ = find_index_by_value(outlines, monster)

        if not index_list:
            print("Error: {} not found in monster manual. Try different entry".
                  format(monster))
            return None

        # Dive into the nested outline lists to reach the monster entry
        current_level = outlines[index_list[0]]
        for idx in index_list[1:]:
            current_level = current_level[idx]

        # Get the page id of the monster that we are interested in
        # (Note: this id is not the same as the actual page number
        # itself, so it is converted through the id -> number mapping)
        page_id = current_level.page.idnum

        page_id_to_number_dict, _ = assign_page_id_to_number(pdf_reader)

        page_number = page_id_to_number_dict[page_id]

        # Print out creature text for parsing purposes
        page = pdf_reader.getPage(page_number)
        text = page.extractText().replace('\n', '')
        print("creature text: ", text)

        return page_number
Example #10
0
def main():
    """Extract the images embedded in a PDF into individual image files.

    The source path, the output folder and an optional target page come
    from ``parseParam()``. When a target page (1-based) is given, only that
    page is processed. Every image XObject is written to the output folder
    as ``<source base name>_p<page>_<object name>`` with an extension
    chosen from the stream filters left after decoding: ``.png`` (none
    left), ``.jpg`` (/DCTDecode), ``.jp2`` (/JPXDecode) or ``.tif``
    (/CCITTFaxDecode). A failure on one image is reported and the
    extraction moves on to the next image.
    """
    sourceName, outputFolder, targetPage = parseParam()
    fileBase = os.path.splitext(os.path.basename(sourceName))[0]
    # The context manager closes the source PDF once extraction is over.
    with open(sourceName, "rb") as sourceFile:
        pdfObj = PyPDF2.PdfFileReader(sourceFile)
        for iPage in range(0, pdfObj.numPages):
            pageObj = pdfObj.getPage(iPage)

            if targetPage and (iPage + 1 != targetPage):
                print("Skip page {}.".format(iPage + 1))
                continue
            print("Processing page {} of {}...".format(
                iPage + 1, pdfObj.numPages))

            try:
                xObject = pageObj['/Resources']['/XObject'].getObject()
            except KeyError:
                # The page declares no XObject resources: nothing to extract.
                continue

            iImage = 0
            for obj in xObject:
                stream = xObject[obj]
                if stream['/Subtype'] != '/Image':
                    continue

                iImage += 1
                title = obj[1:]  # object name without its leading "/"
                fileName = "{2}_p{0:0>3}_{3}".format(iPage + 1, iImage,
                                                     fileBase, title)
                outFileName = os.path.join(outputFolder, fileName)

                size = (stream['/Width'], stream['/Height'])

                # Map the PDF colour space to a PIL image mode (and to a
                # palette for the palette-based modes).
                colorSpace = stream['/ColorSpace']
                if colorSpace == '/DeviceRGB':
                    mode = "RGB"
                elif colorSpace == '/DeviceCMYK':
                    mode = "CMYK"
                elif colorSpace == '/DeviceGray':
                    mode = "L"
                elif colorSpace[0] == "/Indexed":
                    mode = "P"
                    colorSpace, base, hival, lookup = [
                        v.getObject() for v in colorSpace
                    ]
                    palette = lookup.getData()
                elif colorSpace[0] == "/ICCBased":
                    mode = "P"
                    lookup = colorSpace[1].getObject()
                    palette = lookup.getData()
                elif colorSpace[0] == "/DeviceN":
                    # UNKNOWN TYPE
                    mode = "P"
                    palette = DEFAULT_PALETTE
                else:
                    # The statements that used to follow this `continue`
                    # could never run (and read `filters` before it was
                    # assigned), so they have been removed.
                    print("[ERROR] Unknown mode: {}".format(colorSpace))
                    continue

                try:
                    data = stream._data
                    filters = stream.get("/Filter", ())
                    if not isinstance(filters, PyPDF2.generic.ArrayObject):
                        filters = [filters]
                    # Filters that are still applied to `data`; each one is
                    # removed as soon as it has been decoded below.
                    leftFilters = copy.deepcopy(filters)

                    if data:
                        for filterType in filters:
                            if filterType == "/FlateDecode" or filterType == "/Fl":
                                data = FlateDecode.decode(
                                    data, stream.get("/DecodeParms"))
                                leftFilters.remove(filterType)
                            elif filterType == "/ASCIIHexDecode" or filterType == "/AHx":
                                data = ASCIIHexDecode.decode(data)
                                leftFilters.remove(filterType)
                            elif filterType == "/LZWDecode" or filterType == "/LZW":
                                data = LZWDecode.decode(
                                    data, stream.get("/DecodeParms"))
                                leftFilters.remove(filterType)
                            elif filterType == "/ASCII85Decode" or filterType == "/A85":
                                data = ASCII85Decode.decode(data)
                                leftFilters.remove(filterType)
                            elif filterType == "/Crypt":
                                # The stream dictionary key is /DecodeParms
                                # (as used for the other filters above), not
                                # /DecodeParams.
                                decodeParams = stream.get("/DecodeParms", {})
                                if "/Name" not in decodeParams and "/Type" not in decodeParams:
                                    pass
                                else:
                                    raise NotImplementedError(
                                        "/Crypt filter with /Name or /Type not supported yet"
                                    )
                                leftFilters.remove(filterType)
                            elif filterType == ():
                                # Placeholder used when the stream has no
                                # /Filter entry at all.
                                leftFilters.remove(filterType)

                        # case of Flat image
                        if len(leftFilters) == 0:
                            img = Image.frombytes(mode, size, data)
                            if mode == "P":
                                img.putpalette(palette)
                            if mode == "CMYK":
                                img = img.convert('RGB')
                            img.save(outFileName + ".png")

                        # case of JPEG
                        elif len(leftFilters
                                 ) == 1 and leftFilters[0] == '/DCTDecode':
                            jpgData = BytesIO(data)
                            img = Image.open(jpgData)
                            if mode == "CMYK":
                                # case of CMYK: invert all channels
                                imgData = np.frombuffer(img.tobytes(),
                                                        dtype='B')
                                invData = np.full(imgData.shape,
                                                  255,
                                                  dtype='B')
                                invData -= imgData
                                img = Image.frombytes(img.mode, img.size,
                                                      invData.tobytes())
                            img.save(outFileName + ".jpg")

                        # case of JPEG2000: the stream is written as is
                        elif len(leftFilters
                                 ) == 1 and leftFilters[0] == '/JPXDecode':
                            with open(outFileName + ".jp2", "wb") as img_file:
                                img_file.write(data)

                        # case of TIFF: a TIFF header is put in front of
                        # the CCITT data
                        elif len(leftFilters) == 1 and leftFilters[
                                0] == '/CCITTFaxDecode':
                            if stream['/DecodeParms']['/K'] == -1:
                                CCITT_group = 4
                            else:
                                CCITT_group = 3
                            width = stream['/Width']
                            height = stream['/Height']
                            img_size = len(data)
                            tiff_header = tiff_header_for_CCITT(
                                width, height, img_size, CCITT_group)
                            with open(outFileName + ".tif", 'wb') as img_file:
                                img_file.write(tiff_header + data)

                        elif len(leftFilters) >= 1:
                            # leftFilters is a list: convert it explicitly,
                            # str + list raises TypeError.
                            print("[WARING] Unknown filter: " +
                                  str(leftFilters))

                except Exception as ex:
                    print("[ERROR] " + fileName)
                    print("\t" + str(ex))
    print("Completed.")
Example #11
0
import PyPDF2
# Open both source PDFs and wrap each of them in a reader
pdf1File = open('meetingminutes.pdf', 'rb')
pdf1Reader = PyPDF2.PdfFileReader(pdf1File)
pdf2File = open('meetingminutes2.pdf', 'rb')
pdf2Reader = PyPDF2.PdfFileReader(pdf2File)

# Writer that receives the combined pages
pdfWriter = PyPDF2.PdfFileWriter()

# Append every page of the first PDF, then every page of the second
for sourceReader in (pdf1Reader, pdf2Reader):
    for pageNum in range(sourceReader.numPages):
        pageObj = sourceReader.getPage(pageNum)
        pdfWriter.addPage(pageObj)

# Write the combined document
pdfOutputFile = open('combinedminutes.pdf', 'wb')
pdfWriter.write(pdfOutputFile)

# Close the output first, then the two inputs
for openedFile in (pdfOutputFile, pdf1File, pdf2File):
    openedFile.close()
def pdf_list(names):
    """Merge the given PDFs, in order, into the file ``merged.pdf``.

    :param names: iterable of inputs accepted by ``PdfFileMerger.append``
    """
    merger = PyPDF2.PdfFileMerger()
    try:
        for pdf in names:
            merger.append(pdf)
        merger.write('merged.pdf')
    finally:
        # Release the file handles held by the merger, also when an input
        # cannot be read or the output cannot be written.
        merger.close()
Example #13
0
import pyttsx3
import PyPDF2

# Read 'python4everybody.pdf' aloud, one page at a time.
# The context manager closes the PDF once reading is over (or on error).
with open('python4everybody.pdf', 'rb') as path:
    pdfReader = PyPDF2.PdfFileReader(path)
    # One speech engine for the whole run instead of one init per page.
    speak = pyttsx3.init()

    # At most the first 244 pages are read; the bound is capped by the real
    # page count so that a shorter PDF does not make getPage raise.
    for i in range(0, min(244, pdfReader.numPages)):
        from_page = pdfReader.getPage(i)
        text = from_page.extractText()
        speak.say(text)
        speak.runAndWait()



Example #14
0
    def extract_pdf_image(self, full_file_name: str):
        """Extract image files from the current pdf.

        Every image XObject of every page is saved under
        ``<self.pdf_img_path>/<pdf base name>/`` as
        ``ImgFilePage<page>_<index>.<ext>``; the extension follows the
        image's ``/Filter``: tiff (/CCITTFaxDecode), png (/FlateDecode),
        jpg (/DCTDecode) or jp2 (/JPXDecode). Images with any other filter
        are reported and skipped. Nothing is done when ``full_file_name``
        is not an existing file.

        An error on one page is printed and the next page is processed;
        an ``OSError`` is printed and ends the extraction.

        :param full_file_name: path of the pdf to extract the images from
        """
        try:
            if not os.path.isfile(full_file_name):
                return
            # The context manager closes the pdf once all pages are done.
            with open(full_file_name, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfFileReader(pdf_file)
                print(f'Current Pdf: {full_file_name}')
                for n in range(pdf_reader.getNumPages()):
                    try:
                        self._extract_pdf_page_images(
                            pdf_reader.getPage(n), n, full_file_name)
                    except Exception as e:
                        print(
                            f'An error occurred extracting images from page: {n}'
                        )
                        print(e)
        except OSError as e:
            print(
                f'OSError: An error occurred while trying to extract images from pdf: {full_file_name}'
            )

    def _extract_pdf_page_images(self, page, n, full_file_name):
        """Save every image XObject of ``page`` (page index ``n``) to disk."""
        xObject = page['/Resources']['/XObject'].getObject()
        # per-page counter used in the output file names
        m = 0
        for obj in xObject:
            image = xObject[obj]
            # Only image XObjects carry the entries read below; other
            # XObjects are skipped instead of being handled with the data
            # of a previous image.
            if image['/Subtype'] != '/Image':
                continue

            size = (image['/Width'], image['/Height'])
            data = image._data
            if image.get('/ColorSpace') == '/DeviceRGB':
                mode = "RGB"
            else:
                mode = "P"
            image_filter = image.get('/Filter')

            if image_filter == '/CCITTFaxDecode':
                # .tiff: a TIFF header is put in front of the CCITT data
                self.img_format = 'tiff'
                new_dir = self._ensure_pdf_image_dir(full_file_name)
                if image['/DecodeParms']['/K'] == -1:
                    self.CCITT_group = 4
                else:
                    self.CCITT_group = 3
                tiff_header = self.tiff_header_CCITT(
                    width=size[0],
                    height=size[1],
                    img_size=len(data),
                    CCITT_group=self.CCITT_group)
                img_name = f'ImgFilePage{n}_{m}.tiff'
                with open(os.path.join(new_dir, img_name),
                          'wb') as img_file:
                    img_file.write(tiff_header + data)

            elif image_filter == '/FlateDecode':
                # .png: `_data` is the stream as stored in the pdf (still
                # compressed), so the decoded bytes are used for the pixels
                self.img_format = 'png'
                new_dir = self._ensure_pdf_image_dir(full_file_name)
                img = Image.frombytes(mode, size, image.getData())
                img.save(os.path.join(new_dir, f'ImgFilePage{n}_{m}.png'))

            elif image_filter in ('/DCTDecode', '/JPXDecode'):
                # .jpg / .jp2: the stored stream is written as is
                if image_filter == '/DCTDecode':
                    self.img_format = 'jpg'
                else:
                    self.img_format = 'jp2'
                new_dir = self._ensure_pdf_image_dir(full_file_name)
                img_name = f'ImgFilePage{n}_{m}.{self.img_format}'
                with open(os.path.join(new_dir, img_name),
                          'wb') as img_file:
                    img_file.write(data)

            else:
                # no handler for this filter
                print(
                    f'Pdf: {full_file_name} has no images on page: {n}'
                )
            m += 1

    def _ensure_pdf_image_dir(self, full_file_name):
        """Return the image directory of the pdf, creating it if needed."""
        pdf_name = os.path.basename(os.path.splitext(full_file_name)[0])
        new_dir = os.path.join(self.__dict__['pdf_img_path'], pdf_name)
        # Recorded on every call, so it is also correct when the directory
        # already exists (e.g. on a second run over the same pdf).
        self.current_pdf_dir = new_dir
        if not os.path.exists(new_dir):
            os.mkdir(new_dir)
            # NOTE(review): fixed pause after creating the directory, kept
            # from the original code; its purpose is not evident here.
            time.sleep(4)
        return new_dir
import PyPDF2
import time
from tqdm import tqdm

# Stamp page 0 of 'png2pdf.pdf' onto every page of 'dummypdf.pdf' and save
# the result as 'watermarked_output.pdf'.
# Both inputs stay open until the output is written (page content is read
# from them at write time) and are then closed by the context manager.
with open('dummypdf.pdf', 'rb') as template_file, \
        open('png2pdf.pdf', 'rb') as watermark_file:
    template = PyPDF2.PdfFileReader(template_file)
    watermark = PyPDF2.PdfFileReader(watermark_file)
    watermark_page = watermark.getPage(0)

    output = PyPDF2.PdfFileWriter()

    # The progress bar follows the pages actually processed.
    for i in tqdm(range(template.getNumPages())):
        page = template.getPage(i)
        page.mergePage(watermark_page)
        output.addPage(page)

    # Written once, after all pages have been merged, instead of rewriting
    # the whole output file after every page.
    with open('watermarked_output.pdf', 'wb') as file:
        output.write(file)
print("All PDF's are watermarked")
Example #16
0
def scan_to_stub(src_scanned_pdf_file_path,
                 dst_stub_pdf_file_path,
                 toc,
                 title,
                 orchestra,
                 stamp_descs=None,
                 page_info_line_y_pos=1.0):
    """
    creates musical score stub from a musical score raw scan :
    - adds a table of contents
    - adds a stamp
    - numbers the pages

    :param Path src_scanned_pdf_file_path: the source file that is expected to contain the scanned musical scores
    :param Path dst_stub_pdf_file_path: the destination file that is expected to contain the stub of musical scores
    :param TableOfContents toc: must contain at least one track
    :param str title: musical piece title
    :param Orchestra orchestra: the inventory of musical instruments
    :param list(StampDesc) or None stamp_descs: description of the stamps to overlay on each page (None means no stamp)
    :param float page_info_line_y_pos: y position of the status line relative to the bottom of the page
    :raises Exception: if a track id of the toc cannot be turned into a Track
    """
    assert len(toc.tracks) > 0
    assert isinstance(src_scanned_pdf_file_path, Path)
    assert isinstance(dst_stub_pdf_file_path, Path)
    # a fresh list per call : a list used as default value would be shared
    # by all the calls
    if stamp_descs is None:
        stamp_descs = []
    # check that the track_ids in the toc are known
    for track_id in toc.get_track_ids():
        try:
            Track(track_id, orchestra)
        except KeyError as e:
            raise Exception(
                "Failed to identify track id '%s'. Either its syntax is incorrect or the related instrument in not yet registered in the orchestra."
                % (track_id)) from e

    # working directory that receives the images extracted from the scan
    tmp_dir = Path('/tmp/pymusco')
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # one image file per page of the scanned pdf, in page order
    scanned_image_file_paths = []
    with open(src_scanned_pdf_file_path, 'rb') as src_pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(src_pdf_file)
        for page_index in range(pdf_reader.numPages):
            print('page_index = %d' % page_index)
            page = pdf_reader.getPage(page_index)
            image_file_path = extract_pdf_page(page,
                                               image_dir=tmp_dir,
                                               image_name=('page%03d' %
                                                           page_index))

            scanned_image_file_paths.append(image_file_path)

    images_to_pdf(
        StubContents(image_file_paths=scanned_image_file_paths,
                     toc=toc,
                     title=title,
                     stamp_descs=stamp_descs,
                     page_info_line_y_pos=page_info_line_y_pos),
        dst_stub_pdf_file_path)
Example #17
0
def fetch_wrapper(monster_manual_path, creature_page_number):
    """
    Wrapper function to iterate over page numbers until
    the desired creature trait we want to fetch has been found

    :param monster_manual_path: path of the Monster Manual pdf; it is opened
                                in binary mode and read with PyPDF2

    :param creature_page_number: page number for creature info in
                                 Monster Manual, determined by the
                                 monster_manual_lookup method

    :return creature_dict: dictionary with all the traits that we are
                           interested in (i.e., attributes, hit_points,
                           etc.) as keys and stats for those traits as
                           values
    """
    # dict to hold all information we need about a creature
    creature_dict = {}

    # list of traits that we would like to fetch for creature
    trait_list = [
        'attributes',
        'armor_class',
        'hit_points',
        'alignment',
        'creature_type',
        'languages',
        'passive_perception',
    ]

    # Dict to keep track of traits we are unable to fetch. It is created once
    # per call (not once per trait) so that a failure recorded for one trait
    # is not thrown away when the next trait starts.
    # NOTE(review): nothing reads error_dict yet; it is not part of the
    # returned value.
    error_dict = {}

    # Open up Monster Manual pdf and initialize a new reader
    with open(monster_manual_path, 'rb') as pdf:

        # Create a pdf reader object
        pdf_reader = PyPDF2.PdfFileReader(pdf)

        for trait in trait_list:
            # Every trait search restarts from the creature's first page;
            # pass_code stays falsy until a fetch helper reports success.
            pass_code = 0
            trait_page_number = creature_page_number

            # Initialize empty dict for attributes; it is handed back to
            # fetch_attributes on every page so results can accumulate
            if trait == 'attributes':
                attribute_dict = {}

            while not pass_code:

                trait_page = pdf_reader.getPage(trait_page_number)
                print("trait_page_number: ", trait_page_number)
                trait_text = trait_page.extractText().replace('\n', '')

                # Fetch attributes
                if trait == 'attributes':

                    attribute_dict, pass_code = fetch_attributes(
                        trait_text, attribute_dict=attribute_dict)

                # Fetch armor class
                elif trait == 'armor_class':

                    armor_class, pass_code = fetch_armor_class(trait_text)

                # Fetch hit points
                elif trait == 'hit_points':

                    hit_points, pass_code = fetch_hit_points(trait_text)

                # Fetch alignment
                elif trait == 'alignment':

                    alignment, pass_code = fetch_alignment(trait_text)

                # Fetch creature type ('alignment' precedes it in trait_list,
                # so alignment is already bound here)
                elif trait == 'creature_type':

                    creature_type, pass_code = fetch_creature_type(
                        trait_text, alignment)
                    print("pass_score after creature_type: ", pass_code)

                elif trait == 'languages':
                    bag_of_words = trait_text.split()
                    languages, pass_code = fetch_known_languages(bag_of_words)

                # NOTE: 'challenge' is not in trait_list, so this branch is
                # currently never taken
                elif trait == 'challenge':
                    bag_of_words = trait_text.split()
                    challenge, pass_code = fetch_challenge_rating(bag_of_words)

                # NOTE: no fetcher exists for passive_perception yet, so this
                # trait always runs into the page limit below
                elif trait == 'passive_perception':
                    pass

                # see if pass code has been issued. If not, increment page
                # number by one and fetch next page of text to sift through
                if not pass_code:
                    trait_page_number += 1

                    # If we have gone five pages without seeing anything, issue
                    # error code for attributes and continue
                    if (trait_page_number - creature_page_number) > 5:

                        # Issue error codes for specific attributes we were unable
                        # to fetch
                        if trait == 'attributes':
                            error_dict['attributes'] = {}

                            for key in attribute_dict.keys():

                                if not attribute_dict[key]:
                                    error_dict['attributes'][key] = 1

                                else:
                                    error_dict['attributes'][key] = 0

                        else:
                            error_dict[trait] = 0

                        # Since we have busted the five page limit, break the
                        # loop for this trait and move onto the next
                        break

    # Populate the creature_dict with traits that we have fetched
    creature_dict['attributes'] = attribute_dict
    creature_dict['armor_class'] = armor_class
    creature_dict['hit_points'] = hit_points
    creature_dict['alignment'] = alignment
    creature_dict['creature_type'] = creature_type
    creature_dict['languages'] = languages

    return creature_dict
Example #18
0
import PyPDF2

# Stamp the first page of wtr.pdf onto every page of combined.pdf.
# Both inputs are opened with a context manager so they are closed when the
# script is done, and they stay open until the output has been written.
with open('combined.pdf', 'rb') as template_file, \
        open('wtr.pdf', 'rb') as watermark_file:
    template = PyPDF2.PdfFileReader(template_file)
    watermark = PyPDF2.PdfFileReader(watermark_file)
    output = PyPDF2.PdfFileWriter()

    for i in range(template.getNumPages()):
        page = template.getPage(i)
        page.mergePage(watermark.getPage(0))
        output.addPage(page)

    # Write the result once, after all pages were added, instead of
    # rewriting the whole file on every loop iteration.
    with open('combined_watermarked.pdf', 'wb') as file:
        output.write(file)
Example #19
0
import pyttsx3
import PyPDF2 as p2
# Get file handle of the PDF you want to read aloud; the context manager
# closes it once the text has been extracted
with open('./pdf/lipsum.pdf', 'rb') as file:
    # Read contents of file into book object
    book = p2.PdfFileReader(file)
    # Load first page (page 0 i.e. page 1) contents into page object
    page = book.getPage(0)
    # Store contents of page in a text object
    text = page.extractText()
# Print the text for debugging purpose.
# Sometimes the pyttsx3 package is not able to read all types of pdf files
# You may comment it out by adding a # before the line below, it's your choice
print(text)
# Initialize the read-aloud python package
speaker = pyttsx3.init()
# Read aloud the pdf page selected
speaker.say(text)
# The line below is used to block program execution until the read-aloud command buffer / queue is cleared or read-out
speaker.runAndWait()
print('Program completed.')
Example #20
0
def extractData(file, page):
    """Return the text extracted from one page of a PDF file.

    :param file: path of the PDF file; it is opened in binary mode
    :param page: page index handed to the reader's getPage()
    :return: the result of extractText() for that page
    """
    # The file is closed as soon as the text has been extracted; the
    # extraction itself must happen while the file is still open.
    with open(file, 'rb') as pdfFileObj:
        pdfReader = pdf.PdfFileReader(pdfFileObj)
        pageObj = pdfReader.getPage(page)
        return pageObj.extractText()
import PyPDF2
# Extract the text of the last page of example2.pdf; the context manager
# closes the PDF once the text has been read.
with open("example2.pdf", "rb") as pdf:
    pdfRead = PyPDF2.PdfFileReader(pdf)
    pdfPages = pdfRead.numPages
    selectedPages = pdfRead.getPage(pdfPages - 1)
    #pyPdf2 used with text data
    text = selectedPages.extractText()

# Append the extracted text to the output file (mode "a" keeps earlier runs).
with open(
        r"C:\Users\Raksh\Documents\GitHub\Python-Converting-Pdf-To-Text\text2.txt",
        "a") as file:
    # text is a single string, so write() is the right call here
    file.write(text)
print("Done Converting !!")
import PyPDF2 as pd

# filename = input('Path to the file: ')
filename = 'output.pdf'

# number of passwords that were tried and rejected
tried = 0

with open(filename, 'rb') as file:
    pdfReader = pd.PdfFileReader(file)

    if not pdfReader.isEncrypted:
        print('The file is not encryted! You can successfully open it!')

    else:
        # One candidate password per line; candidates are lower-cased
        with open('dictionary.txt', 'r') as wordListFile:
            body = wordListFile.read().lower()
        words = body.split('\n')

        for word in words:
            print('Trying dencryption by: {}'.format(word))
            result = pdfReader.decrypt(word)
            # decrypt() returns 0 on failure, 1 when the user password
            # matched and 2 when the owner password matched; both non-zero
            # values mean the password was found.
            if result:
                print('Success! The password is: ' + word)
                break

            tried += 1
            print('Passwords tried: ' + str(tried))
            
import PyPDF2
import pyttsx3

# Read every page of the PDF aloud, one page at a time. The file is kept
# open for the whole loop and closed by the context manager afterwards.
with open('Related/sample.pdf', 'rb') as infile:
    pdfReader = PyPDF2.PdfFileReader(infile)
    num_Pages = pdfReader.numPages
    print(num_Pages)

    start = pyttsx3.init()
    print("Playing audio..")

    for i in range(num_Pages):
        page = pdfReader.getPage(i)
        text = page.extractText()
        start.say(text)
        # block until this page has been spoken before queuing the next one
        start.runAndWait()
Example #24
0
import PyPDF2

# creating a pdf file object; the context manager closes it when done
with open('filename.pdf', 'rb') as pdfFileObj:

    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # printing number of pages in pdf file
    number_pages = pdfReader.numPages
    print(number_pages)

    # counters referenced by the commented-out section parsing further down
    count = 0
    count_1 = 0
    # pages 0 and 1 are skipped; extraction starts at the third page
    for i in range(2, number_pages):
        # creating a page object
        pageObj = pdfReader.getPage(i)

        # extracting text from page
        requiremt_extract = pageObj.extractText()
        print(requiremt_extract)
#    a=requiremt_extract.split('\n' or '.')
#    for section_number in a:
#        if len(section_number)<=3:
#            for dot in section_number:
#                if dot == '.':
#                    try:
#                        if(int(section_number[:-1])):
#                            count=int(section_number[:-1])
#                            print(section_number[:-1])
#
Example #25
0
import PyPDF2

# Print the page count and the text of the first page of sample.pdf.
# The context manager closes the file even if reading fails.
with open('sample.pdf', 'rb') as pdfFile:

    # The class is PdfFileReader (capital P); PyPDF2.pdfFileReader does not
    # exist and raised AttributeError.
    pdfReader = PyPDF2.PdfFileReader(pdfFile)

    print(pdfReader.numPages)

    pageObj = pdfReader.getPage(0)

    print(pageObj.extractText())
# Full paths of all '.pdf' entries of the target directory, ordered
# case-insensitively.
pdfFiles = sorted(
    (os.path.join(directory_target, entry)
     for entry in os.listdir(directory_target)
     if entry.endswith('.pdf')),
    key=str.lower,
)

logging.debug('pdf files list after sorting')
logging.debug(pdfFiles)

# The new temporary PDF; pages of every input file are added to it.
pdfWriter = PyPDF2.PdfFileWriter()

# Walk over all the PDF files.
for filename in pdfFiles:
    # Opened in binary mode and left open here; presumably the pages are
    # still needed when the writer is saved later on.
    pdfFileObj = open(filename, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # Page 0 (the first page) is skipped on purpose, so start at index 1.
    for pageNum in range(1, pdfReader.numPages):
        pageObj = pdfReader.getPage(pageNum)
        pdfWriter.addPage(pageObj)

# save the resulting PDF to a file
Example #27
0
import PyPDF2

# Stamp the first page of wtr.pdf onto every page of super_pdf.pdf.
# Both inputs are opened with a context manager so they are closed when the
# script is done, and they stay open until the output has been written.
with open('super_pdf.pdf', 'rb') as template_file, \
        open('wtr.pdf', 'rb') as watermark_file:
    template = PyPDF2.PdfFileReader(template_file)
    watermark = PyPDF2.PdfFileReader(watermark_file)
    output = PyPDF2.PdfFileWriter()

    for i in range(template.getNumPages()):
        page = template.getPage(i)
        page.mergePage(watermark.getPage(0))
        output.addPage(page)

    # Write the result once, after all pages were added, instead of
    # rewriting the whole file on every loop iteration.
    with open('watermarked_output.pdf', 'wb') as file:
        output.write(file)
Example #28
0
def pdfMerge(pdf_list):
    """Concatenate the given PDFs into 'Adrien Clay Resume Cover Letter.pdf'.

    :param pdf_list: iterable of inputs, each passed to PdfFileMerger.append
                     in the order given
    """
    merger = PyPDF2.PdfFileMerger()
    try:
        for pdf in pdf_list:
            merger.append(pdf)

        merger.write('Adrien Clay Resume Cover Letter.pdf')
    finally:
        # release the input handles held by the merger, even on failure
        merger.close()
Example #29
0
import PyPDF2

# This works fine
# Print the text of every page of demo.pdf and copy the pages, with links
# removed, into new.pdf.
with open('demo.pdf', 'rb') as pdf_obj:
    pdf = PyPDF2.PdfFileReader(pdf_obj)
    out = PyPDF2.PdfFileWriter()
    for page in pdf.pages:
        # page.scale(2, 2)

        extracted = page.extractText()

        print(extracted, "\n\n")
        out.addPage(page)

    # Strip the links before the output file is opened, so a failure here
    # does not leave a truncated new.pdf behind.
    out.removeLinks()
    with open('new.pdf', 'wb') as f:
        out.write(f)

# # This attempts to remove annotations
# with open('old.pdf', 'rb') as pdf_obj:
#     pdf = PyPDF2.PdfFileReader(pdf_obj)
#     page = pdf.pages[2]
#     print(page['/Annots'], '\n\n\n\n')
#     page.Annots = []
#     print(page['/Annots'])
Example #30
0
import PyPDF2

# with open("resume_AaryamanSaini.pdf", "rb") as file:
#     reader = PyPDF2.PdfFileReader(file)
#     print(reader.numPages)
#     page = reader.getPage(0)
#     page.rotateClockwise(90)
#     writer = PyPDF2.PdfFileWriter()
#     writer.addPage(page)
#     with open("rotated.pdf", "wb") as output:
#         writer.write(output)

# Concatenate the listed PDFs, in order, into combined.pdf.
merger = PyPDF2.PdfFileMerger()
file_names = ["rotated.pdf", "resume_AaryamanSaini.pdf"]
try:
    for file_name in file_names:
        merger.append(file_name)

    merger.write("combined.pdf")
finally:
    # release the input handles held by the merger, even on failure
    merger.close()
Example #31
0
## Code to rotate a complete PDF file
import PyPDF2

file_name = '20200210225306926'
# NOTE: not used below; every page of the file is rotated
page_to_be_rotated = 1

# Context managers close both files even if reading or writing fails.
with open(f'{file_name}.pdf', 'rb') as pdf_in:
    pdf_reader = PyPDF2.PdfFileReader(pdf_in)
    pdf_writer = PyPDF2.PdfFileWriter()

    for pagenum in range(pdf_reader.numPages):
        page = pdf_reader.getPage(pagenum)

        page.rotateClockwise(180)
        pdf_writer.addPage(page)

    # The input stays open while writing; the output is written once, after
    # all pages have been added.
    with open(f'{file_name}_rotated.pdf', 'wb') as pdf_out:
        pdf_writer.write(pdf_out)