# Example 1: print the text of every horizontal text box, page by page.
#
# pdfminer has no `PDFPageCreate` class. Layout analysis is done by feeding
# each page through a PDFPageInterpreter whose output device is a
# PDFPageAggregator; the aggregator then exposes the analysed page layout.
from pdfminer.converter import PDFPageAggregator
from pdfminer.high_level import extract_text  # not used below; kept from the original example
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage

with open('example.pdf', 'rb') as fp:
    # The resource manager, device and interpreter are reusable, so build
    # them once instead of once per page.
    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        # The layout of the page just processed is read from the device,
        # not from the interpreter.
        layout = device.get_result()

        # extract only horizontal text boxes from layout
        text_boxes = [tb for tb in layout if isinstance(tb, LTTextBoxHorizontal)]

        # extract text from each text box
        page_text = ''.join(tb.get_text() for tb in text_boxes)
        print(page_text)
# Example 2: count the pages of a PDF and check whether it is encrypted.
#
# Encryption is a property of the document, not of individual pages
# (PDFPage has no `is_encrypted` attribute), so a PDFDocument is built
# explicitly and queried.
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

with open('example.pdf', 'rb') as fp:
    # NOTE(review): PDFDocument is opened with the default empty password;
    # a file protected by a non-empty user password is expected to raise
    # here rather than report is_encrypted = True -- confirm against the
    # installed pdfminer version.
    document = PDFDocument(PDFParser(fp))

    # get the number of pages in the PDF document
    page_count = sum(1 for _ in PDFPage.create_pages(document))

    # check if the PDF document is encrypted: `encryption` stays None unless
    # the document trailer carries an /Encrypt entry
    is_encrypted = document.encryption is not None

# In both examples, we imported classes and functions from multiple modules
# in the PDFMiner library, including PDFPage, LAParams, LTTextBoxHorizontal,
# and extract_text.