Python PdfFileReader.getDocumentInfo Exemples, PyPDF3.PdfFileReader.getDocumentInfo Python Exemples

Exemple #1

0

Afficher le fichier

def extract_information(pdf_path):
    testread = ""
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        testread = pdf.getPage(92).extractText().strip()
        print(pdf.getPage(92).extractText().strip())
        number_of_pages = pdf.getNumPages()

    # txt = f"""
    # Information about {pdf_path}:

    # Author: {information.author}
    # Creator: {information.creator}
    # Producer: {information.producer}
    # Subject: {information.subject}
    # Title: {information.title}
    # Number of pages: {number_of_pages}
    # """
    print(testread)

    # define variables
    s = testread.strip()
    file = "file.mp3"

    # initialize tts, create mp3 and play
    tts = gTTS(s, 'en')
    tts.save(file)
    #os.system("mpg123 " + file)

    return information

Exemple #2

0

Afficher le fichier

def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    doc_info = None
    xmp_info = None

    if use_pdf_meta:
        with open(tmp_file_path, 'rb') as f:
            pdf_file = PdfFileReader(f)
            doc_info = pdf_file.getDocumentInfo()
            xmp_info = parse_xmp(pdf_file)

    if xmp_info:
        author = ' & '.join(split_authors(xmp_info['author']))
        title = xmp_info['title']
        subject = xmp_info['subject']
        tags = xmp_info['tags']
        languages = xmp_info['languages']
        publisher = xmp_info['publisher']
    else:
        author = u'Unknown'
        title = ''
        languages = [""]
        publisher = ""
        subject = ""
        tags = ""

    if doc_info:
        if author == '':
            author = ' & '.join(split_authors([doc_info.author])) if doc_info.author else u'Unknown'
        if title == '':
            title = doc_info.title if doc_info.title else original_file_name
        if subject == '':
            subject = doc_info.subject or ""
        if tags == '' and '/Keywords' in doc_info:
            if isinstance(doc_info['/Keywords'], bytes):
                tags = doc_info['/Keywords'].decode('utf-8')
            else:
                tags = doc_info['/Keywords']
    else:
        title = original_file_name

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags=tags,
        series="",
        series_id="",
        languages=','.join(languages),
        publisher=publisher,
        pubdate="",
        identifiers=[])

Exemple #3

0

Afficher le fichier

def pdf_parser(s):
    s = s.strip()
    # required to suppress warning messages
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(BytesIO(s), strict=False, warndest=fp)
    if pdf.isEncrypted:
        try:
            pdf.decrypt('')
        except NotImplementedError:
            return {}
    meta = pdf.getDocumentInfo() or {}
    #print(str(meta))
    result = {}
    for key in meta.keys():
        result[key[1:]] = meta.get(key)
    return result

Exemple #4

0

Afficher le fichier

Fichier : pdfmeta.py Projet : c002/BabySploit

def start():
    from PyPDF3 import PdfFileReader
    import glob
    print("Put PDF file in pdfs/")
    print("Which PDF file would you like to read the meta data for?")
    for d in glob.iglob("pdfs/*"):
        if "emptyfile" not in d:
            print(d.replace("pdfs/"))
    ans = str(input("> "))
    if ".pdf" in ans:
        pass
    else:
        ans = ans + ".pdf"
    pdffile = PdfFileReader(file=(ans, 'rb'))
    docInfo = pdffile.getDocumentInfo()
    for metaItem in docInfo:
        print("- " + metaItem + ":" + docInfo[metaItem])
    print("\n")

Exemple #5

0

Afficher le fichier

def pdfHandler(file_dir):
    input1 = PdfFileReader(open(file_dir, 'rb'))

    print('document1.pdf has {} pages.'.format(str(input1.getNumPages())))

    fields = input1.getFields()
    print(type(fields))

    documentInfo = input1.getDocumentInfo()
    print(type(documentInfo))
    if documentInfo is not None:
        for key in documentInfo.keys():
            print('{} : {}'.format(key, documentInfo.get(key)))

    metaData = input1.getXmpMetadata()
    print(type(metaData))
    if metaData is not None:
        # print(metaData)
        for relation in metaData.dc_relation:
            print('relation: {}'.format(relation))

Exemple #6

0

Afficher le fichier

Fichier : admin_views.py Projet : bikeshedder/shark

def invoice_pdf(request, number, correction=False):
    invoice = get_object_or_404(Invoice, number=number)
    if correction:
        invoice = invoice.correction
    from reportlab.lib.units import mm
    from reportlab.platypus import Paragraph
    from reportlab.platypus.flowables import Spacer
    from reportlab.platypus.flowables import KeepTogether

    from dinbrief.document import Document
    from dinbrief.invoice import ItemTable, TotalTable
    from dinbrief.styles import styles
    from dinbrief.template import BriefTemplate

    with trans_override(invoice.language):

        response = HttpResponse(content_type='application/pdf')
        if 'download' in request.GET:
            filename = '%s.pdf' % invoice.number
            response[
                'Content-Disposition'] = 'attachment; filename=%s' % filename

        if invoice.type == Invoice.TYPE_INVOICE:
            if callable(INVOICE_TERMS):
                terms = INVOICE_TERMS(invoice)
            else:
                terms = [
                    Paragraph(term, styles['Terms']) for term in INVOICE_TERMS
                ]
        else:
            terms = []

        template = BriefTemplate()
        document = Document(
            sender=invoice.sender_lines,
            recipient=invoice.recipient_lines,
            date=date_format(invoice.created, 'SHORT_DATE_FORMAT'),
            content=[
                Paragraph(
                    '%s %s' %
                    (invoice.get_type_display() if not correction else
                     gettext(u'Correction of invoice'), invoice.number),
                    styles['Subject']),
                Spacer(template.CONTENT_WIDTH, 2 * mm),
                ItemTable(template, invoice),
                KeepTogether(TotalTable(template, invoice)),
                Spacer(template.CONTENT_WIDTH, 10 * mm),
            ] + terms)

        if settings.SHARK['INVOICE']['BACKGROUND']:
            with tempfile.TemporaryFile() as tmp:
                # Create content in a temporary file
                template.render(document, tmp)
                # Combine background with the content
                writer = PdfFileWriter()
                content = PdfFileReader(tmp)
                info_dict = writer._info.getObject()
                info_dict.update(content.getDocumentInfo())
                first_bg = PdfFileReader(
                    open(settings.SHARK['INVOICE']['BACKGROUND']['FIRST_PAGE'],
                         'rb'))
                later_bg = PdfFileReader(
                    open(settings.SHARK['INVOICE']['BACKGROUND']['LATER_PAGE'],
                         'rb'))
                bg = [first_bg.getPage(0), later_bg.getPage(0)]
                for i, page in enumerate(content.pages):
                    page.mergePage(bg[min(i, 1)])
                    page.compressContentStreams()
                    writer.addPage(page)
                writer.write(response)
        else:
            # Render content directly to the HTTP response object if no
            # background images are configured.
            template.render(document, response)

    return response