def pdf_2_text(pdf_file_name: str, password='', page_numbers=None, maxpages=0, caching=True, laparams=None) -> str:
    """Extract the text of a PDF file and return it as a single string.

    This is a re-write of the function pdfminer.high_level.extract_text
    https://github.com/pdfminer/pdfminer.six/blob/0b44f7771462363528c109f263276eb254c4fcd0/pdfminer/high_level.py#L90
    It produces a result which does not have this issue:
    https://github.com/pdfminer/pdfminer.six/issues/466

    :param pdf_file_name: name of the input PDF file
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: zero-indexed page numbers to operate on
    :param maxpages: How many pages to stop parsing after
    :param caching: forwarded unchanged to ``PDFPage.get_pages``
    :param laparams: forwarded unchanged to ``_PDFpage2txt`` for every page
    :return: the text of every processed page, concatenated in the order
        the pages are yielded by ``PDFPage.get_pages``
    """
    with open_filename(pdf_file_name, "rb") as pdf_file_object:
        pages = PDFPage.get_pages(
            pdf_file_object,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        # Join once instead of re-building the accumulated string on every
        # page; the pages must be consumed while the file is still open.
        result = "".join(_PDFpage2txt(page, laparams) for page in pages)
    return result
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, caching=True, codec='utf-8', laparams=None):
    """Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    """
    if laparams is None:
        # Fall back to the default layout parameters, as documented above.
        # (Previously this branch re-assigned None, so no defaults were used.)
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(
            fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        ):
            interpreter.process_page(page)

        # Read the buffer before the ``with`` block closes the StringIO.
        return output_string.getvalue()
def extract_text_by_page(pdf_file, password='', page_numbers=None, maxpages=0, caching=True, codec='utf-8', laparams=None):
    """Parse and return the text contained in each page of a PDF file.

    Taken from
    https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py#L90-L123
    and adapted to return the text of each page separately as a dictionary.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a dict containing the text from each page (keys = page numbers)
    """
    if laparams is None:
        laparams = LAParams()

    text_by_page = {}
    with open_filename(pdf_file, "rb") as fp:
        # Honour the caller's ``caching`` choice, as extract_text does.
        rsrcmgr = PDFResourceManager(caching=caching)
        pages_iterable = PDFPage.get_pages(
            fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )

        if page_numbers is None:
            tuples_iterable = enumerate(pages_iterable)
        else:
            # NOTE(review): per pdfminer.six, get_pages yields the selected
            # pages once each, in document order, whatever the order of
            # ``page_numbers``. Label them with the sorted, de-duplicated,
            # non-negative numbers so each text is stored under its own page.
            requested = sorted({num for num in page_numbers if num >= 0})
            tuples_iterable = zip(requested, pages_iterable)

        for page_num, page in tuples_iterable:
            # A fresh buffer per page keeps each page's text separate.
            with StringIO() as output_string:
                device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                text_by_page[page_num] = output_string.getvalue()

    return text_by_page
def test_string_input(self):
    """A path given as ``str`` yields an ``open_filename`` with ``closing`` set."""
    filename = absolute_sample_path("simple1.pdf")
    opened = open_filename(filename)
    # Use the context-manager protocol so whatever open_filename opened is
    # released when the test finishes, instead of being left open.
    with opened:
        assert_equal(opened.closing, True)
def test_file_input(self):
    """An already-open file object is exposed as-is via ``file_handler``."""
    pdf_path = absolute_sample_path("simple1.pdf")
    with open(pdf_path, "rb") as source_handle:
        wrapper = open_filename(source_handle)
        assert_equal(wrapper.file_handler, source_handle)
def test_pathlib_input(self):
    """A ``pathlib.Path`` input yields an ``open_filename`` with ``closing`` set."""
    filename = pathlib.Path(absolute_sample_path("simple1.pdf"))
    opened = open_filename(filename)
    # Use the context-manager protocol so whatever open_filename opened is
    # released when the test finishes, instead of being left open.
    with opened:
        assert_equal(opened.closing, True)