def pdf_contains(self, pdf_haystack, needle):
    """Assert that ``pdf_haystack`` is a PDF whose parsed tree contains ``needle``.

    Two checks run through the assertion helpers on ``self``: the data must
    start with the ``%PDF`` magic bytes, and a pdfquery ``:contains`` lookup
    for ``needle`` must give a truthy result. The second check makes sure the
    real content is in the PDF, not just an error page.
    """
    self.assertEqual('%PDF', pdf_haystack[:4])

    document = PDFQuery(StringIO(pdf_haystack))

    # pdfquery chokes on these document attributes (NULL bytes, per the
    # original note), so they are overwritten with fixed values before load.
    replacements = (
        ('Creator', 'test'),
        ('Producer', 'Plone and phantomjs'),
        ('Title', 'A valid title'),
    )
    for attribute, value in replacements:
        document.doc.info[0][attribute] = value
    document.load()

    selector = ':contains("%s")' % needle
    self.assertTrue(document.pq(selector))
 def pdf_contains(self, pdf_haystack, needle):
     """Assert that ``pdf_haystack`` is a PDF whose parsed tree contains ``needle``.

     Two checks run through the assertion helpers on ``self``: the data must
     start with the ``%PDF`` magic bytes, and a pdfquery ``:contains`` lookup
     for ``needle`` must give a truthy result. The second check makes sure the
     real content is in the PDF, not just an error page.
     """
     self.assertEqual('%PDF', pdf_haystack[:4])

     document = PDFQuery(StringIO(pdf_haystack))

     # pdfquery chokes on these document attributes (NULL bytes, per the
     # original note), so they are overwritten with fixed values before load.
     replacements = (
         ('Creator', 'test'),
         ('Producer', 'Plone and phantomjs'),
         ('Title', 'A valid title'),
     )
     for attribute, value in replacements:
         document.doc.info[0][attribute] = value
     document.load()

     selector = ':contains("%s")' % needle
     self.assertTrue(document.pq(selector))
def extract_page_layouts(file):
    """
    Return the list of per-page layout results for the PDF at path ``file``.

    The file is opened in binary mode and its document object is obtained
    through ``PDFQuery``; each page is then run through a pdfminer
    ``PDFPageInterpreter`` whose ``PDFPageAggregator`` device yields one
    result per page, collected in page order.

    Raises ``PDFTextExtractionNotAllowed`` when the document reports that it
    is not extractable.

    Modified from:
    http://www.degeneratestate.org/posts/2016/Jun/15/extracting-tabular-data-from-pdfs/
    Original author's note: tests show that using PDFQuery to extract the
    document is ~ 5 times faster than pdfminer.
    """
    layout_params = LAParams()

    with open(file, mode='rb') as handle:
        print("Open document %s" % handle.name)
        doc = PDFQuery(handle).doc

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        def _layout_of(page):
            # The aggregator exposes the layout of the page processed last.
            page_interpreter.process_page(page)
            return aggregator.get_result()

        return [_layout_of(page) for page in PDFPage.create_pages(doc)]
Exemple #4
0
    def __init__(self, file):
        """Wrap ``file`` in a ``PDFQuery`` object after checking the class setup.

        Args:
            file (str, File): A path to a PDF file, or a file-like object that
                represents a pdf document.

        Raises:
            IOError: Per the original documentation, when a file path is
                specified and the file is not found (not raised explicitly
                by this method).
            InvalidPDFError: If ``PDFQuery`` rejects the file with a
                ``PDFSyntaxError``.
        """
        self._data = self._variant = None  # TODO: Is `_variant` needed?

        # Runs before the file is touched; whatever it raises propagates.
        self._check_configuration()

        try:
            query = PDFQuery(file)
        except PDFSyntaxError:
            raise InvalidPDFError("The provided file doesn't seem to be a valid PDF document")  # noqa
        else:
            self._file = query
def load_pdf(file: str, cache_dir: str = "./.cache/") -> PDFQuery:
    """Loads and parses a PDF file with pdfquery.

    The parse tree cacher is a ``FileCache`` rooted at ``cache_dir``; the
    folder is created first when it does not exist yet.

    Parameters
    ----------
    file : str
        Path to a PDF file
    cache_dir : str, optional
        folder to store cache in, by default "./.cache/"

    Returns
    -------
    PDFQuery
        A PDFQuery object with the parsed PDF

    Raises
    ------
    FileExistsError
        If ``cache_dir`` exists but is not a directory
    """
    # exist_ok=True creates the folder only when it is missing and avoids the
    # check-then-create race of a separate os.path.exists() test (another
    # process may create the folder between the check and the makedirs call).
    os.makedirs(cache_dir, exist_ok=True)

    pdf = PDFQuery(file, parse_tree_cacher=FileCache(cache_dir))
    pdf.load()
    return pdf
Exemple #6
0
# python 3 script to parse the final result of the Tweede Kamerverkiezingen 2017 into csv from the pdf at
# https://www.kiesraad.nl/adviezen-en-publicaties/rapporten/2017/3/proces-verbaal-zitting-kiesraad-uitslag-tweede-kamerverkiezing-2017/getekend-proces-verbaal-zitting-bekendmaking-uitslag-tweede-kamer-22-03-2017
# usage: python pdfparse.py <input pdf> <output csv>
# requirements: pdfquery
# warning: uses a few GB ram when parsing the pdf for the first time
# license: apache v2

from sys import argv, stdout
from pdfquery import PDFQuery
from pdfquery.cache import FileCache
import csv
import re

# Parse the PDF named by the first command-line argument. The parse tree
# cacher is a FileCache rooted at /tmp/ - presumably so repeat runs can skip
# the expensive first parse mentioned in the header warning.
pdf = PDFQuery(argv[1], parse_tree_cacher=FileCache("/tmp/"))
pdf.load()
# Debug aid (disabled): write the parsed tree, pretty-printed, to the output
# path instead of producing the CSV.
# pdf.tree.write(argv[2], pretty_print=True)


def get(p):
    """Return the only item of ``p``, or ``None`` unless ``p`` has exactly one item."""
    return p[0] if len(p) == 1 else None


# Open the output CSV named by the second command-line argument. newline=""
# leaves line-ending handling to the csv writer.
# NOTE(review): no matching close() is visible in this excerpt - confirm the
# handle is closed further down the script.
f = open(argv[2], "wt", newline="")
kiescsv = csv.writer(f)
# Header row; the Dutch column names are part of the output format.
kiescsv.writerow(
    ["partijnummer", "partij", "volgnummer", "naam", "kieskring", "stemmen"])

# NOTE(review): presumably filled with per-"kieskring" data later in the
# script; nothing in this excerpt populates it.
kieskringen = {}
Exemple #7
0
def load_pdf(file, pages):
    """Return a ``PDFQuery`` for ``file`` with only the given ``pages`` loaded.

    The parse tree cacher is a ``FileCache`` rooted at
    ``Config.APP_CACHE_FOLDER`` with a trailing slash appended. ``pages`` is
    unpacked into positional arguments of ``PDFQuery.load``.
    """
    cache_folder = Config.APP_CACHE_FOLDER + '/'
    document = PDFQuery(file, parse_tree_cacher=FileCache(cache_folder))
    document.load(*pages)
    return document
def extract_competencies(pdf: PDFQuery) -> List[Dict]:
    """Extracts Lernziele/Kompetenzen and Voraussetzungen from BHT modulehandbooks.

    Every visited page is searched for a "Modulnummer" entry; pages without
    one are reported through ``eprint`` and skipped. For the remaining pages
    the module title, competencies and requirements are looked up as well.
    A field whose selector cannot be built is reported through ``eprint`` and
    left out of that page's result instead of aborting the whole extraction.

    Parameters
    ----------
    pdf : PDFQuery
        The PDF

    Returns
    -------
    List[Dict]
        One dict per module page with the keys ``page`` (1-based page
        number) and ``id``, plus ``name``, ``competencies`` and
        ``requirements`` for every field that was found on the page.
        ``id`` and ``name`` are stripped strings; ``competencies`` and
        ``requirements`` hold whatever ``split_sentences`` returns.
    """
    # Offsets applied to the computed bounding box; the same for every field.
    value_deviations = (Point(120, 0), Point(490, 1))

    # Optional fields as (descriptors, underlying descriptors, result key).
    # The first descriptor doubles as the label in the error message.
    optional_fields = (
        (("Titel", ), ("Leistungspunkte", "Credits"), "name"),
        (("Lernziele / Kompetenzen", "Lernziele/Kompetenzen"),
         ("Voraussetzungen", ), "competencies"),
        (("Voraussetzungen", ), ("Niveaustufe", ), "requirements"),
    )

    page_count = get_page_count(pdf)
    results: List[Dict] = []

    # NOTE(review): range(page_count - 1) never visits the last page index if
    # get_page_count returns the total number of pages - confirm this is
    # intended.
    for i in range(page_count - 1):
        # Limit the extraction to the current page and only extract text
        selectors = [
            ('with_parent', 'LTPage[page_index="%s"]' % (i)),
            ('with_formatter', 'text'),
        ]

        # Try to find a "Modulnummer" on that page. If there is none, then it's
        # not a module-description page.
        try:
            selectors.append(
                get_selector_for_element_text(pdf, i, ("Modulnummer", ),
                                              ("Titel", ), value_deviations,
                                              "id"))
        except ValueError:
            eprint("No \"Modulnummer\" found on page %s, skipping..." %
                   (i + 1))
            continue

        # Title, competencies and requirements are best effort: a missing
        # field is logged and the remaining fields are still extracted.
        for descriptors, underlying_descriptors, key in optional_fields:
            try:
                selectors.append(
                    get_selector_for_element_text(pdf, i, descriptors,
                                                  underlying_descriptors,
                                                  value_deviations, key))
            except ValueError as err:
                eprint("Error parsing \"%s\": %s" % (descriptors[0], err))

        # Do the extraction
        page_results: Dict = pdf.extract(selectors)

        # Add the pagenumber for convenience reasons
        page_results['page'] = i + 1

        # Trim extracted text. Only "id" is guaranteed to be present here;
        # the optional keys exist only when their selector could be built, so
        # they are post-processed conditionally to avoid a KeyError.
        page_results['id'] = page_results['id'].strip()
        if 'name' in page_results:
            page_results['name'] = page_results['name'].strip()

        # Split the extracted sentences (which also does a trim to each
        # sentence)
        for key in ('competencies', 'requirements'):
            if key in page_results:
                page_results[key] = split_sentences(page_results[key])

        results.append(page_results)

    return results
def get_selector_for_element_text(pdf: PDFQuery,
                                  page: int,
                                  descriptors: Tuple[str, ...],
                                  underlying_descriptors: Tuple[str, ...],
                                  value_deviations: Tuple[Point, Point],
                                  desc: Optional[str] = None) -> Tuple:
    """Builds a pdfquery selector for a text value located via descriptors

    The operation is based on a descriptor of the value to extract and an underlying descriptor used
    to calculate the bounding box of the value of interest on the page.
    You can use value_deviations to adjust the calculated bounding box.
    ┌───────────────────────┬──────────────────────────┐
    │ descriptor            │ This is the text we want │
    ├───────────────────────┼──────────────────────────┤
    │ underlying_descriptor │ uninteresting text       │
    └───────────────────────┴──────────────────────────┘

    The box spans, horizontally, from the descriptor's ``x0`` plus
    ``value_deviations[0].x`` to the descriptor's ``x0`` plus
    ``value_deviations[1].x``; vertically, from the underlying descriptor's
    ``y1`` plus ``value_deviations[0].y`` to the descriptor's ``y1`` plus
    ``value_deviations[1].y``.

    Parameters
    ----------
    pdf : PDFQuery
        The PDF
    page : int
        The zero-based page index to use
    descriptors : Tuple[str, ...]
        Descriptors to search for on the page; the first one found is used
    underlying_descriptors : Tuple[str, ...]
        Descriptors that follow the descriptor on the page; the first one
        found is used
    value_deviations : Tuple[Point, Point]
        Two points with the deviation from the initial calculation for the start and the end of the bbox (e.g. first column of table is smaller/bigger)
    desc : Optional[str], optional
        A description of the data you try to extract, by default None, which
        uses the lower-cased descriptor that was found

    Returns
    -------
    Tuple
        A 3-tuple ``(desc, selector, formatter)``: the description, the
        generated ``in_bbox`` selector string and a callable that returns the
        stripped text of a match

    Raises
    ------
    ValueError
        If none of the descriptors is found on the page (including when
        ``descriptors`` is empty)
    ValueError
        If none of the underlying descriptors is found on the page (including
        when ``underlying_descriptors`` is empty)
    """

    descriptor, descriptor_element = _find_text_line(pdf, page, descriptors)
    if len(descriptor_element) < 1:
        raise ValueError("Descriptor \"%s\" not found on page %s" %
                         (descriptor, page + 1))

    underlying_descriptor, underlying_element = _find_text_line(
        pdf, page, underlying_descriptors)
    if len(underlying_element) < 1:
        raise ValueError("Underlaying descriptor \"%s\" not found on page %s" %
                         (underlying_descriptor, page + 1))

    # Both corners are offset from the descriptor's left edge.
    left = float(descriptor_element.attr('x0'))
    value_coords = (
        Point(left + value_deviations[0].x,
              float(underlying_element.attr('y1')) + value_deviations[0].y),
        Point(left + value_deviations[1].x,
              float(descriptor_element.attr('y1')) + value_deviations[1].y),
    )

    if desc is None:
        desc = descriptor.lower()

    return (desc, 'LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' %
            (value_coords[0].x, value_coords[0].y, value_coords[1].x,
             value_coords[1].y), lambda match: match.text().strip())


def _find_text_line(pdf, page, candidates):
    """Return ``(candidate, matches)`` for the first candidate found on ``page``.

    Each candidate is looked up as an ``LTTextLineHorizontal`` element that
    contains the candidate text. When nothing matches, the last candidate
    tried is returned with its empty result; for empty ``candidates`` the
    pair is ``(None, [])`` so callers can treat both cases alike.
    """
    candidate, matches = None, []
    for candidate in candidates:
        matches = pdf.pq(
            'LTPage[page_index="%s"] LTTextLineHorizontal:contains("%s")' %
            (page, candidate))
        if len(matches) >= 1:
            break
    return candidate, matches
Exemple #10
0
 def get_pdf():
     """Return a ``PDFQuery`` for 'current_afr_revenue_ruling.pdf' next to this file."""
     here = pathlib.Path(__file__).parent.absolute()
     pdf_path = os.path.join(here, 'current_afr_revenue_ruling.pdf')
     return PDFQuery(pdf_path)
Exemple #11
0
class Document(object):
    """A PDF document that is matched against a set of known variants.

    Subclasses set ``variants`` to a list of DocVariant subclasses. The file
    is wrapped in a ``PDFQuery`` object on construction, loaded by
    :meth:`extract`, and the first variant whose match test succeeds performs
    the extraction and becomes available through :attr:`data`.
    """

    # The variants of this document. A list of DocVariant subclasses. Files
    # will be checked against these variants to find an appropriate match.
    variants = []

    def __init__(self, file):
        """Wrap ``file`` in a ``PDFQuery`` object after checking the class setup.

        Args:
            file (str, File): A path to a PDF file, or a file-like object that
                represents a pdf document.

        Raises:
            ValueError: If the class has no variants configured.
            IOError: If a file path is specified and the file is not found
                (per the original documentation; not raised explicitly here).
            InvalidPDFError: If the specified file is not a PDF.
        """
        self._data = None
        self._variant = None  # TODO: Is this needed?

        self._check_configuration()

        try:
            self._file = PDFQuery(file)
        except PDFSyntaxError:
            raise InvalidPDFError("The provided file doesn't seem to be a valid PDF document")  # noqa

    @property
    def data(self):
        """Read only property that is loaded with document data once
        `extract()` is called. This will be an instance of a DocVariant
        subclass.
        """
        return self._data

    def detect_variant(self):
        """Tests the loaded file against all variants specified in the
        ``variants`` attribute and returns the one that matches.

        Every variant is instantiated with the loaded file up front; the
        instances are then tested in declaration order and the first match
        wins.

        Returns:
            One of the DocVariant subclasses specified in `self.variants`
            or None, if no suitable match is found.
        """
        variant_objs = [var(self._file) for var in self.variants]
        for variant in variant_objs:
            if variant.load_test_fields_and_check_for_match():
                return variant
        return None

    def extract(self):
        """Loads up file, detects the variant of the document and performs
        extraction of fields. Extracted information is stored in ``self.data``.

        Raises:
            UnknownVariantError: If none of the configured variants matches
                the file.
        """
        # Load happens here (lazy)
        self._file.load()

        variant = self._variant = self.detect_variant()

        if variant is None:
            raise UnknownVariantError(
                'The specified file {file} could not be matched against any '
                'of the variants specified in `{cls_name}.variants`.\n If '
                'this is a new variant, please define an appropriate '
                'DocVariant subclass for it.'.format(
                    file=self._file_label(),
                    cls_name=self.__class__.__name__
                )
            )

        # Load completely
        variant.extract()

        self._data = variant

    def _file_label(self):
        """Return a printable label for the underlying file object.

        File-like objects are accepted by the constructor and need not have
        a ``name`` attribute; fall back to their ``repr`` so that building an
        error message never fails with ``AttributeError``.
        """
        source = self._file.file
        return getattr(source, 'name', repr(source))

    def _check_configuration(self):
        """Raise ``ValueError`` when the class has no variants configured."""
        if not self.variants:
            raise ValueError(
                "The class '{name}' hasn't been configured with any variants."
                " Set {name}.variants to a list of DocVariant types.".format(
                    name=self.__class__.__name__
                )
            )
Exemple #12
0
def cli(pdf_path):
    """Load ``pdf_path`` with pdfquery and print its header name.

    ``pdf_path`` is passed to ``PDFQuery``; after loading, the value that
    ``get_header_name`` returns for the document is printed.
    """
    document = PDFQuery(pdf_path)
    document.load()
    header_name = get_header_name(document)
    print(header_name)
Exemple #13
0
    def _test_pdfquery(self):
        """
        Test 4 - Using pdfquery.
        Source: https://github.com/jcushman/pdfquery

        For every path in ``self.pdfs`` the page count is read from the
        document catalog through ``PDFQuery`` and the elapsed time is
        recorded with ``_save_mining_time``. Files that raise one of the
        handled errors are recorded with ``self.default_time`` and the error
        is collected. The totals are printed and merged into
        ``self.final_stats_dict`` under the ``pdfquery_*`` keys.
        """

        print(Colors.UNDERLINE +
              '________________________________________________\n' +
              Colors.ENDC)

        total_pages, errors, total_mining_time = [], [], []

        # 1-based position, used only for the progress line.
        for position, pdf_file in enumerate(self.pdfs, start=1):
            filename = LibrariesTesting.strip_accents(
                os.path.basename(pdf_file))
            file_size = self.convert_size(self.get_file_size(pdf_file))

            try:
                start_time = time.time()

                with open(pdf_file, 'rb') as f:
                    reader = PDFQuery(f)
                    pages_count = reader.doc.catalog['Pages'].resolve(
                    )['Count']

                    # The timed span covers opening the file and reading the
                    # page count; the value is kept as a formatted string.
                    elapsed = self.decimal_round.format(time.time() -
                                                        start_time)
                    total_mining_time.append(elapsed)

                    self._save_mining_time(item=(filename, elapsed),
                                           test_type='pdfquery')

                    total_pages.append(pages_count)

                    progress = (
                        '[PDFQUERY] File {i}/{index}. Total pages: {pages_count} --> "{filename}" - {file_size}'
                        .format(i=position,
                                index=len(self.pdfs),
                                pages_count=pages_count,
                                filename=filename,
                                file_size=file_size))
                    print(Colors.FAIL + progress + Colors.ENDC)
            except (KeyError, AttributeError, TypeError, PDFSyntaxError,
                    PDFEncryptionError) as error:
                self._save_mining_time(item=(filename, self.default_time),
                                       test_type='pdfquery')
                errors.append(error)

        page_counts = [int(count) for count in total_pages]
        total_errors = len(errors)

        list_set_errors = list(set(errors))
        total_parsing_time = sum(float(spent) for spent in total_mining_time)

        pdfquery_total_pages = sum(page_counts)

        summary = '[PDFQUERY] Total pages count: {pdfquery_total_pages}'.format(
            pdfquery_total_pages=pdfquery_total_pages)
        print(Colors.FAIL + summary + Colors.ENDC)

        self.final_stats_dict.update(
            pdfquery_total_pages=pdfquery_total_pages,
            pdfquery_total_parsing_time=total_parsing_time,
            pdfquery_errors={
                'count': total_errors,
                'errors': list_set_errors,
            },
        )