Ejemplo n.º 1
0
def extract_information(pdf_path):
    testread = ""
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        testread = pdf.getPage(92).extractText().strip()
        print(pdf.getPage(92).extractText().strip())
        number_of_pages = pdf.getNumPages()

    # txt = f"""
    # Information about {pdf_path}:

    # Author: {information.author}
    # Creator: {information.creator}
    # Producer: {information.producer}
    # Subject: {information.subject}
    # Title: {information.title}
    # Number of pages: {number_of_pages}
    # """
    print(testread)

    # define variables
    s = testread.strip()
    file = "file.mp3"

    # initialize tts, create mp3 and play
    tts = gTTS(s, 'en')
    tts.save(file)
    #os.system("mpg123 " + file)

    return information
Ejemplo n.º 2
0
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    doc_info = None
    xmp_info = None

    if use_pdf_meta:
        with open(tmp_file_path, 'rb') as f:
            pdf_file = PdfFileReader(f)
            doc_info = pdf_file.getDocumentInfo()
            xmp_info = parse_xmp(pdf_file)

    if xmp_info:
        author = ' & '.join(split_authors(xmp_info['author']))
        title = xmp_info['title']
        subject = xmp_info['subject']
        tags = xmp_info['tags']
        languages = xmp_info['languages']
        publisher = xmp_info['publisher']
    else:
        author = u'Unknown'
        title = ''
        languages = [""]
        publisher = ""
        subject = ""
        tags = ""

    if doc_info:
        if author == '':
            author = ' & '.join(split_authors([doc_info.author])) if doc_info.author else u'Unknown'
        if title == '':
            title = doc_info.title if doc_info.title else original_file_name
        if subject == '':
            subject = doc_info.subject or ""
        if tags == '' and '/Keywords' in doc_info:
            if isinstance(doc_info['/Keywords'], bytes):
                tags = doc_info['/Keywords'].decode('utf-8')
            else:
                tags = doc_info['/Keywords']
    else:
        title = original_file_name

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags=tags,
        series="",
        series_id="",
        languages=','.join(languages),
        publisher=publisher,
        pubdate="",
        identifiers=[])
Ejemplo n.º 3
0
def pdf_parser(s):
    s = s.strip()
    # required to suppress warning messages
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(BytesIO(s), strict=False, warndest=fp)
    if pdf.isEncrypted:
        try:
            pdf.decrypt('')
        except NotImplementedError:
            return {}
    meta = pdf.getDocumentInfo() or {}
    #print(str(meta))
    result = {}
    for key in meta.keys():
        result[key[1:]] = meta.get(key)
    return result
Ejemplo n.º 4
0
def start():
    from PyPDF3 import PdfFileReader
    import glob
    print("Put PDF file in pdfs/")
    print("Which PDF file would you like to read the meta data for?")
    for d in glob.iglob("pdfs/*"):
        if "emptyfile" not in d:
            print(d.replace("pdfs/"))
    ans = str(input("> "))
    if ".pdf" in ans:
        pass
    else:
        ans = ans + ".pdf"
    pdffile = PdfFileReader(file=(ans, 'rb'))
    docInfo = pdffile.getDocumentInfo()
    for metaItem in docInfo:
        print("- " + metaItem + ":" + docInfo[metaItem])
    print("\n")
Ejemplo n.º 5
0
def pdfHandler(file_dir):
    input1 = PdfFileReader(open(file_dir, 'rb'))

    print('document1.pdf has {} pages.'.format(str(input1.getNumPages())))

    fields = input1.getFields()
    print(type(fields))

    documentInfo = input1.getDocumentInfo()
    print(type(documentInfo))
    if documentInfo is not None:
        for key in documentInfo.keys():
            print('{} : {}'.format(key, documentInfo.get(key)))

    metaData = input1.getXmpMetadata()
    print(type(metaData))
    if metaData is not None:
        # print(metaData)
        for relation in metaData.dc_relation:
            print('relation: {}'.format(relation))
Ejemplo n.º 6
0
def invoice_pdf(request, number, correction=False):
    invoice = get_object_or_404(Invoice, number=number)
    if correction:
        invoice = invoice.correction
    from reportlab.lib.units import mm
    from reportlab.platypus import Paragraph
    from reportlab.platypus.flowables import Spacer
    from reportlab.platypus.flowables import KeepTogether

    from dinbrief.document import Document
    from dinbrief.invoice import ItemTable, TotalTable
    from dinbrief.styles import styles
    from dinbrief.template import BriefTemplate

    with trans_override(invoice.language):

        response = HttpResponse(content_type='application/pdf')
        if 'download' in request.GET:
            filename = '%s.pdf' % invoice.number
            response[
                'Content-Disposition'] = 'attachment; filename=%s' % filename

        if invoice.type == Invoice.TYPE_INVOICE:
            if callable(INVOICE_TERMS):
                terms = INVOICE_TERMS(invoice)
            else:
                terms = [
                    Paragraph(term, styles['Terms']) for term in INVOICE_TERMS
                ]
        else:
            terms = []

        template = BriefTemplate()
        document = Document(
            sender=invoice.sender_lines,
            recipient=invoice.recipient_lines,
            date=date_format(invoice.created, 'SHORT_DATE_FORMAT'),
            content=[
                Paragraph(
                    '%s %s' %
                    (invoice.get_type_display() if not correction else
                     gettext(u'Correction of invoice'), invoice.number),
                    styles['Subject']),
                Spacer(template.CONTENT_WIDTH, 2 * mm),
                ItemTable(template, invoice),
                KeepTogether(TotalTable(template, invoice)),
                Spacer(template.CONTENT_WIDTH, 10 * mm),
            ] + terms)

        if settings.SHARK['INVOICE']['BACKGROUND']:
            with tempfile.TemporaryFile() as tmp:
                # Create content in a temporary file
                template.render(document, tmp)
                # Combine background with the content
                writer = PdfFileWriter()
                content = PdfFileReader(tmp)
                info_dict = writer._info.getObject()
                info_dict.update(content.getDocumentInfo())
                first_bg = PdfFileReader(
                    open(settings.SHARK['INVOICE']['BACKGROUND']['FIRST_PAGE'],
                         'rb'))
                later_bg = PdfFileReader(
                    open(settings.SHARK['INVOICE']['BACKGROUND']['LATER_PAGE'],
                         'rb'))
                bg = [first_bg.getPage(0), later_bg.getPage(0)]
                for i, page in enumerate(content.pages):
                    page.mergePage(bg[min(i, 1)])
                    page.compressContentStreams()
                    writer.addPage(page)
                writer.write(response)
        else:
            # Render content directly to the HTTP response object if no
            # background images are configured.
            template.render(document, response)

    return response