def pdf_contains(self, pdf_haystack, needle):
    """Assert that *pdf_haystack* is a PDF document whose text contains *needle*."""
    # A real PDF always begins with the %PDF magic marker.
    self.assertEqual('%PDF', pdf_haystack[:4])
    # Parse with pdfquery so we can verify the real content is present,
    # not just an error page.
    document = PDFQuery(StringIO(pdf_haystack))
    # pdfquery chokes on NULL bytes in these document attributes, so
    # overwrite them with harmless values before loading.
    info = document.doc.info[0]
    info['Creator'] = 'test'
    info['Producer'] = 'Plone and phantomjs'
    info['Title'] = 'A valid title'
    document.load()
    self.assertTrue(document.pq(':contains("%s")' % needle))
def extract_page_layouts(file):
    """
    Extracts LTPage objects from a pdf file.

    modified from:
    http://www.degeneratestate.org/posts/2016/Jun/15/extracting-tabular-data-from-pdfs/

    Tests show that using PDFQuery to extract the document is ~ 5 times
    faster than pdfminer.
    """
    layout_params = LAParams()
    page_layouts = []

    with open(file, mode='rb') as pdf_file:
        print("Open document %s" % pdf_file.name)
        document = PDFQuery(pdf_file).doc

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Wire up the pdfminer extraction pipeline: resource manager ->
        # aggregator device -> page interpreter.
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
        interpreter = PDFPageInterpreter(resource_manager, aggregator)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            page_layouts.append(aggregator.get_result())

    return page_layouts
def __init__(self, file):
    """
    Args:
        file (str, File): A path to a PDF file, or a file-like object that
            represents a pdf document.

    Raises:
        IOError: If a file path is specified and the file is not found.
        InvalidPDFError: If the specified file is not a PDF.
    """
    self._data = None
    self._variant = None

    # TODO: Is this needed?
    self._check_configuration()

    try:
        self._file = PDFQuery(file)
    except PDFSyntaxError as err:
        # Chain the original parser error so the root cause stays visible
        # in the traceback instead of being silently discarded.
        raise InvalidPDFError("The provided file doesn't seem to be a valid PDF document") from err  # noqa
def load_pdf(file: str, cache_dir: str = "./.cache/") -> PDFQuery:
    """Loads and parses a PDF file with pdfquery.

    Parameters
    ----------
    file : str
        Path to a PDF file
    cache_dir : str, optional
        folder to store cache in, by default "./.cache/"

    Returns
    -------
    PDFQuery
        A PDFQuery object with the parsed PDF
    """
    # exist_ok=True avoids the check-then-create race of the previous
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(cache_dir, exist_ok=True)
    pdf = PDFQuery(file, parse_tree_cacher=FileCache(cache_dir))
    pdf.load()
    return pdf
# python 3 script to parse the final result of the Tweede Kamerverkiezingen
# 2017 into csv from the pdf at
# https://www.kiesraad.nl/adviezen-en-publicaties/rapporten/2017/3/proces-verbaal-zitting-kiesraad-uitslag-tweede-kamerverkiezing-2017/getekend-proces-verbaal-zitting-bekendmaking-uitslag-tweede-kamer-22-03-2017
# usage: python pdfparse.py <input pdf> <output csv>
# requirements: pdfquery
# warning: uses a few GB ram when parsing the pdf for the first time
# license: apache v2
from sys import argv, stdout

from pdfquery import PDFQuery
from pdfquery.cache import FileCache

import csv
import re

# Parse the PDF once; the FileCache keeps the (very expensive) parse tree
# on disk so subsequent runs are fast.
pdf = PDFQuery(argv[1], parse_tree_cacher=FileCache("/tmp/"))
pdf.load()
# pdf.tree.write(argv[2], pretty_print=True)


def get(p):
    """Return the single element of *p*, or None when it has any other length."""
    return p[0] if len(p) == 1 else None


f = open(argv[2], "wt", newline="")
kiescsv = csv.writer(f)
kiescsv.writerow(
    ["partijnummer", "partij", "volgnummer", "naam", "kieskring", "stemmen"])

kieskringen = {}
def load_pdf(file, pages):
    """Parse *file* with pdfquery, loading only the given *pages*."""
    cacher = FileCache(Config.APP_CACHE_FOLDER + '/')
    document = PDFQuery(file, parse_tree_cacher=cacher)
    document.load(*pages)
    return document
def extract_competencies(pdf: PDFQuery) -> List[Dict]:
    """Extracts Lernziele/Kompetenzen and Voraussetzungen from BHT
    modulehandbooks.

    Parameters
    ----------
    pdf : PDFQuery
        The PDF

    Returns
    -------
    List[Dict]
        List of extracted values as Dict
    """
    page_count = get_page_count(pdf)
    results: List[Dict] = []

    # Bounding-box tweaks shared by every field lookup (loop invariant).
    deviations = (Point(120, 0), Point(490, 1))

    # (descriptors, underlying descriptors, result key, label for errors).
    # These lookups are optional: a failure is reported but does not skip
    # the page.
    optional_fields = (
        (("Titel", ), ("Leistungspunkte", "Credits"), "name", "Titel"),
        (("Lernziele / Kompetenzen", "Lernziele/Kompetenzen"),
         ("Voraussetzungen", ), "competencies", "Lernziele / Kompetenzen"),
        (("Voraussetzungen", ), ("Niveaustufe", ), "requirements",
         "Voraussetzungen"),
    )

    for i in range(page_count - 1):
        # Limit the extraction to the current page and only extract text
        selectors = [
            ('with_parent', 'LTPage[page_index="%s"]' % (i)),
            ('with_formatter', 'text'),
        ]

        # Try to find a "Modulnummer" on that page. If there is none, then
        # it's not a module-description page.
        try:
            selectors.append(
                get_selector_for_element_text(pdf, i, ("Modulnummer", ),
                                              ("Titel", ), deviations, "id"))
        except ValueError:
            eprint("No \"Modulnummer\" found on page %s, skipping..." %
                   (i + 1))
            continue

        for descriptors, underlying, key, label in optional_fields:
            try:
                selectors.append(
                    get_selector_for_element_text(pdf, i, descriptors,
                                                  underlying, deviations,
                                                  key))
            except ValueError as err:
                eprint("Error parsing \"%s\": %s" % (label, err))

        # Do the extraction
        page_results: Dict = pdf.extract(selectors)

        # Add the pagenumber for convenience reasons
        page_results['page'] = i + 1

        # Trim extracted text
        page_results['id'] = page_results['id'].strip()
        page_results['name'] = page_results['name'].strip()

        # Split the extracted sentences (which also does a trim to each
        # sentence)
        page_results['competencies'] = split_sentences(
            page_results['competencies'])
        page_results['requirements'] = split_sentences(
            page_results['requirements'])

        results.append(page_results)

    return results
def get_selector_for_element_text(pdf: PDFQuery, page: int,
                                  descriptors: Tuple[str, ...],
                                  underlying_descriptors: Tuple[str, ...],
                                  value_deviations: Tuple[Point, Point],
                                  desc: Optional[str] = None):
    """Extracts a text value from the given handbook based on descriptors

    The operation is based on a descriptor of the value to extract and an
    underlying descriptor used to calculate the bounding box of the value
    of interest on the page. You can use value_deviations to adjust the
    calculated bounding box.

    ┌───────────────────────┬──────────────────────────┐
    │ descriptor            │ This is the text we want │
    ├───────────────────────┼──────────────────────────┤
    │ underlying_descriptor │ uninteresting text       │
    └───────────────────────┴──────────────────────────┘

    Parameters
    ----------
    pdf : PDFQuery
        The PDF
    page : int
        The page to use
    descriptors : Tuple[str, ...]
        A tuple of descriptors to search for on the page
    underlying_descriptors : Tuple[str, ...]
        A tuple of descriptors that follow the descriptors to search for on
        the page
    value_deviations : Tuple[Point, Point]
        A tuple with the length of 2 with deviation from initial calculation
        for start and ending of bbox (e.g. first column of table is
        smaller/bigger)
    desc : Optional[str], optional
        A description of the data you try to extract, by default None, uses
        found descriptor as default

    Returns
    -------
    Tuple
        A tuple with the descriptor and generated selector

    Raises
    ------
    ValueError
        If a the descriptor is not found on the page
    ValueError
        If a the underlying descriptor is not found on the page
    """
    # Try each alternative spelling of the descriptor until one matches.
    # NOTE(review): if `descriptors` is empty, `descriptor_element` below is
    # never bound and this raises NameError instead of ValueError — confirm
    # callers always pass a non-empty tuple.
    for descriptor in descriptors:
        descriptor_element = pdf.pq(
            'LTPage[page_index="%s"] LTTextLineHorizontal:contains("%s")' %
            (page, descriptor))
        if len(descriptor_element) >= 1:
            break
    if len(descriptor_element) < 1:
        raise ValueError("Descriptor \"%s\" not found on page %s" %
                         (descriptor, page + 1))

    # Same search for the descriptor of the row below the value of interest.
    for underlaying_descriptor in underlying_descriptors:
        underlaying_descriptor_element = pdf.pq(
            'LTPage[page_index="%s"] LTTextLineHorizontal:contains("%s")' %
            (page, underlaying_descriptor))
        if len(underlaying_descriptor_element) >= 1:
            break
    if len(underlaying_descriptor_element) < 1:
        raise ValueError(
            "Underlaying descriptor \"%s\" not found on page %s" %
            (underlaying_descriptor, page + 1))

    # Bounding box of the value cell: horizontally anchored on the found
    # descriptor's x0, vertically spanning from the underlying descriptor's
    # top (y1) up to the descriptor's top (y1), each shifted by the caller's
    # deviations.
    value_coords = (Point(
        float(descriptor_element.attr('x0')) + value_deviations[0].x,
        float(underlaying_descriptor_element.attr('y1')) +
        value_deviations[0].y),
                    Point(
                        float(descriptor_element.attr('x0')) +
                        value_deviations[1].x,
                        float(descriptor_element.attr('y1')) +
                        value_deviations[1].y))

    # Default the result key to the (last tried) descriptor, lower-cased.
    if desc is None:
        desc = descriptor.lower()

    return (desc, 'LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' %
            (value_coords[0].x, value_coords[0].y, value_coords[1].x,
             value_coords[1].y), lambda match: match.text().strip())
def get_pdf():
    """Return a PDFQuery for the revenue-ruling PDF shipped next to this module."""
    here = pathlib.Path(__file__).parent.absolute()
    return PDFQuery(os.path.join(here, 'current_afr_revenue_ruling.pdf'))
class Document(object):
    """Wraps a PDF file and dispatches extraction to a matching DocVariant."""

    # The variants of this document. A list of DocVariant subclasses. Files
    # will be checked against these variants to find an appropriate match.
    variants = []

    def __init__(self, file):
        """
        Args:
            file (str, File): A path to a PDF file, or a file-like object
                that represents a pdf document.

        Raises:
            IOError: If a file path is specified and the file is not found.
            InvalidPDFError: If the specified file is not a PDF.
        """
        self._data = None
        self._variant = None

        # TODO: Is this needed?
        self._check_configuration()

        try:
            self._file = PDFQuery(file)
        except PDFSyntaxError as err:
            # Chain the parser error so the root cause stays in the
            # traceback instead of being silently discarded.
            raise InvalidPDFError("The provided file doesn't seem to be a valid PDF document") from err  # noqa

    @property
    def data(self):
        """Read only property that is loaded with document data once
        `extract()` is called. This will be an instance of a DocVariant
        subclass.
        """
        return self._data

    def detect_variant(self):
        """Tests the loaded file against all variants specified in the
        ``variants`` attribute and returns the one that matches.

        Returns:
            An instance of one of the DocVariant subclasses specified in
            `self.variants` or None, if no suitable match is found.
        """
        variant_objs = [var(self._file) for var in self.variants]

        matched_variant = None
        for variant in variant_objs:
            if variant.load_test_fields_and_check_for_match():
                matched_variant = variant
                break

        return matched_variant

    def extract(self):
        """Loads up file, detects the variant of the document and performs
        extraction of fields. Extracted information is stored in
        ``self.data``.

        Raises:
            UnknownVariantError: If no variant matches the loaded file.
        """
        # Load happens here (lazy)
        self._file.load()

        variant = self._variant = self.detect_variant()
        if variant is None:
            raise UnknownVariantError(
                'The specified file {file} could not be matched against any '
                'of the variants specified in `{cls_name}.variants`.\n If '
                'this is a new variant, please define an appropriate '
                'DocVariant subclass for it.'.format(
                    file=self._file.file.name,
                    cls_name=self.__class__.__name__
                )
            )

        # Load completely
        variant.extract()
        self._data = variant

    def _check_configuration(self):
        # Fail fast: a Document subclass is unusable without variants.
        if not self.variants:
            raise ValueError(
                "The class '{name}' hasn't been configured with any variants."
                " Set {name}.variants to a list of DocVariant types.".format(
                    name=self.__class__.__name__
                )
            )
def cli(pdf_path):
    """Load the PDF at *pdf_path* and print its header name."""
    document = PDFQuery(pdf_path)
    document.load()
    print(get_header_name(document))
def _test_pdfquery(self):
    """
    Test 4 - Using pdfquery.
    Source: https://github.com/jcushman/pdfquery
    """
    # Visual separator between the per-library benchmark sections.
    print(Colors.UNDERLINE +
          '________________________________________________\n' + Colors.ENDC)
    # Per-file accumulators: page counts, raised exceptions, parse timings.
    total_pages, errors, total_mining_time = [], [], []
    for index, pdf_file in enumerate(self.pdfs):
        # 1-based counter for the progress output.
        index = index + 1
        filename = LibrariesTesting.strip_accents(
            os.path.basename(pdf_file))
        file_size = self.convert_size(self.get_file_size(pdf_file))
        try:
            start_time = time.time()
            with open(pdf_file, 'rb') as f:
                reader = PDFQuery(f)
                # Read the page count straight from the document catalog
                # (no full page parse needed).
                pages_count = reader.doc.catalog['Pages'].resolve(
                )['Count']
                end_time = time.time()
                # decimal_round is presumably a format string like
                # '{:.2f}' — TODO confirm; the value is kept as a string
                # and converted back to float when summed below.
                single_file_time = self.decimal_round.format(end_time -
                                                             start_time)
                total_mining_time.append(single_file_time)
                mining_time = filename, single_file_time
                self._save_mining_time(item=mining_time,
                                       test_type='pdfquery')
                total_pages.append(pages_count)
                print(
                    Colors.FAIL +
                    '[PDFQUERY] File {i}/{index}. Total pages: {pages_count} --> "(unknown)" - {file_size}'
                    .format(i=index,
                            index=len(self.pdfs),
                            pages_count=pages_count,
                            filename=filename,
                            file_size=file_size) + Colors.ENDC)
        except (KeyError, AttributeError, TypeError, PDFSyntaxError,
                PDFEncryptionError) as error:
            # Record a default timing for files pdfquery could not parse
            # and keep the error for the summary statistics.
            self._save_mining_time(item=(filename, self.default_time),
                                   test_type='pdfquery')
            errors.append(error)
            pass
    # Aggregate: coerce counts to int, sum the stringified timings as floats,
    # and deduplicate the collected errors for the report.
    total_pages, total_errors = list(map(int, total_pages)), len(errors)
    list_set_errors, total_parsing_time = list(set(errors)), sum(
        list(map(float, total_mining_time)))
    pdfquery_total_pages = sum(total_pages)
    print(Colors.FAIL +
          '[PDFQUERY] Total pages count: {pdfquery_total_pages}'.format(
              pdfquery_total_pages=pdfquery_total_pages) + Colors.ENDC)
    # Merge this library's results into the shared stats dict.
    self.final_stats_dict.update(
        **{
            'pdfquery_total_pages': pdfquery_total_pages,
            'pdfquery_total_parsing_time': total_parsing_time,
            'pdfquery_errors': {
                'count': total_errors,
                'errors': list_set_errors
            },
        })