Python PDFDocument Beispiele

Programmiersprache: Python

Namespace / Paketname: pdfparser

Klasse / Typ: PDFDocument

Beispiele auf hotexamples.com: 5

Python PDFDocument - 5 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die pdfparser.PDFDocument, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

PDFDocument(2)

get_pages(2)

initialize(2)

set_parser(2)

Beispiel #1

Datei anzeigen

def process_pdf(rsrcmgr,
                device,
                fp,
                pagenos=None,
                maxpages=0,
                password='',
                check_extractable=True):
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if check_extractable and not doc.is_extractable:
        raise PDFTextExtractionNotAllowed(
            'Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for (pageno, page) in enumerate(doc.get_pages()):
        if pagenos and (pageno not in pagenos): continue
        interpreter.process_page(page)
        if maxpages and maxpages <= pageno + 1: break
    return

Beispiel #2

Datei anzeigen

Datei: pdf2txt.py Projekt: abduld/pdftoref

def pdf2txt(outfp, rsrc, fname, pages, codec, password='', debug=0):
  device = TextConverter(rsrc, debug=debug)
  doc = PDFDocument(debug=debug)
  fp = file(fname, 'rb')
  parser = PDFParser(doc, fp, debug=debug)
  try:
    doc.initialize(password)
  except PDFPasswordIncorrect:
    raise TextExtractionNotAllowed('incorrect password')
  if not doc.is_extractable:
    raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
  interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
  outfp.write('<document>\n')
  for (i,page) in enumerate(doc.get_pages(debug=debug)):
    if pages and (i not in pages): continue
    device.reset()
    interpreter.process_page(page)
    device.dump(outfp, codec)
  fp.close()
  device.close()
  outfp.write('</document>\n')
  return

Beispiel #3

Datei anzeigen

def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
    doc = PDFDocument()
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    interpreter = PDFPageInterpreter(rsrc, device)
    for (pageno,page) in enumerate(doc.get_pages()):
        if pagenos and (pageno not in pagenos): continue
        interpreter.process_page(page)
        if maxpages and maxpages <= pageno+1: break
    return

Beispiel #4

Datei anzeigen

Datei: pdfinterp.py Projekt: ktisha/ebook-service

def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
    doc = PDFDocument()
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    interpreter = PDFPageInterpreter(rsrc, device)
    for (pageno,page) in enumerate(doc.get_pages()):
        if pagenos and (pageno not in pagenos): continue
        interpreter.process_page(page)
        if maxpages and maxpages <= pageno+1: break
    return

Beispiel #5

Datei anzeigen

Datei: pdfinterp_altered.py Projekt: srbbins/ETD_Processing_Scripts

def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
                caching=True, check_extractable=True):
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(caching=caching)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if check_extractable and not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for (pageno,page) in enumerate(doc.get_pages()):
        if pagenos and (pageno not in pagenos): continue
        interpreter.process_page(page)
        if maxpages and maxpages <= pageno+1: break
    return