def extract_information(pdf_path, page_number=92, output_file="file.mp3"):
    """Read a PDF's document info and turn one of its pages into speech.

    The text of the selected page is printed and, when it is not empty,
    synthesised to English speech with gTTS and saved as an MP3 file.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number: Zero-based index of the page to read aloud. Defaults to
            92, the page this function was originally hard-wired to.
        output_file: Path of the MP3 file to write. Defaults to "file.mp3".

    Returns:
        The object returned by ``PdfFileReader.getDocumentInfo()``.

    Raises:
        Whatever ``getPage`` raises for a page index the document does not
        have (presumably ``IndexError`` - confirm against the installed
        PDF library).
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        # Extract once and reuse the result instead of re-running the text
        # extraction for every use.
        page_text = pdf.getPage(page_number).extractText().strip()

    print(page_text)

    # gTTS refuses empty input, so only synthesise when there is text.
    if page_text:
        # Pass the language by keyword: in gTTS releases whose second
        # positional parameter is `tld`, a positional 'en' would be taken
        # as the top-level domain rather than the language.
        tts = gTTS(page_text, lang='en')
        tts.save(output_file)

    return information
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build a ``BookMeta`` record for a PDF file.

    When ``use_pdf_meta`` is enabled the file is parsed and its metadata is
    resolved in this order of precedence for every field:

    1. XMP metadata (via ``parse_xmp``),
    2. the document information dictionary,
    3. a default: ``'Unknown'`` for the author, ``original_file_name`` for
       the title, an empty value for everything else.

    Args:
        tmp_file_path: Path of the PDF file on disk.
        original_file_name: File name used as the title of last resort and
            passed on to ``pdf_preview``.
        original_file_extension: Extension stored in the resulting record.

    Returns:
        A ``BookMeta`` instance describing the file.
    """
    doc_info = None
    xmp_info = None

    if use_pdf_meta:
        with open(tmp_file_path, 'rb') as f:
            pdf_file = PdfFileReader(f)
            doc_info = pdf_file.getDocumentInfo()
            xmp_info = parse_xmp(pdf_file)

    if xmp_info:
        author = ' & '.join(split_authors(xmp_info['author']))
        title = xmp_info['title']
        subject = xmp_info['subject']
        tags = xmp_info['tags']
        languages = xmp_info['languages']
        publisher = xmp_info['publisher']
    else:
        # Start from empty values so the document-info fallback below can
        # actually fill them in; the final defaults are applied at the end.
        author = ''
        title = ''
        subject = ''
        tags = ''
        languages = [""]
        publisher = ""

    if doc_info:
        if not author and doc_info.author:
            author = ' & '.join(split_authors([doc_info.author]))
        if not title and doc_info.title:
            title = doc_info.title
        if subject == '':
            subject = doc_info.subject or ""
        if tags == '' and '/Keywords' in doc_info:
            keywords = doc_info['/Keywords']
            if isinstance(keywords, bytes):
                tags = keywords.decode('utf-8')
            else:
                tags = keywords

    # Last-resort defaults, applied only when neither source had a value so
    # that a title found in XMP is never overwritten by the file name.
    if not author:
        author = u'Unknown'
    if not title:
        title = original_file_name

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags=tags,
        series="",
        series_id="",
        languages=','.join(languages),
        publisher=publisher,
        pubdate="",
        identifiers=[])
def pdf_parser(s):
    """Extract the document-information entries of an in-memory PDF.

    Args:
        s: Raw PDF content as bytes. Leading and trailing whitespace is
            stripped before parsing.

    Returns:
        A dict mapping every document-info key, with its first character
        removed (presumably the leading "/" of a PDF name), to its value.
        The dict is empty when the document has no info dictionary or when
        an encrypted document cannot be opened with the empty password.
    """
    s = s.strip()
    # Required to suppress warning messages: the reader writes them to the
    # null device. The sink is kept open until parsing is complete so that a
    # warning emitted by a later call never targets a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(BytesIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                decrypted = pdf.decrypt('')
            except NotImplementedError:
                # Unsupported encryption scheme.
                return {}
            # PyPDF2 documents decrypt() as returning 0 when the password
            # does not match; the metadata is unreadable in that case.
            if not decrypted:
                return {}
        meta = pdf.getDocumentInfo() or {}
        return {key[1:]: meta.get(key) for key in meta.keys()}
def start():
    """Interactively print the document metadata of a PDF stored in ``pdfs/``.

    Lists the files found in ``pdfs/`` (skipping any whose path contains
    "emptyfile"), asks for a file name on standard input, appends ".pdf"
    when the answer does not already end with it, and prints every entry of
    the chosen file's document-information dictionary.

    Returns:
        None. All output goes to standard output.
    """
    import glob
    import os

    from PyPDF3 import PdfFileReader

    print("Put PDF file in pdfs/")
    print("Which PDF file would you like to read the meta data for?")
    for path in glob.iglob("pdfs/*"):
        if "emptyfile" not in path:
            # Show the bare file name, i.e. what the user is expected to type.
            print(os.path.basename(path))

    ans = input("> ").strip()
    # Test the suffix, not mere containment, so "my.pdf.bak" is not mistaken
    # for a PDF file name.
    if not ans.lower().endswith(".pdf"):
        ans += ".pdf"

    # The listing shows names relative to pdfs/, so resolve the answer there.
    pdf_path = os.path.join("pdfs", ans)
    try:
        pdf_stream = open(pdf_path, 'rb')
    except FileNotFoundError:
        print("No such file: " + pdf_path)
        return

    with pdf_stream:
        doc_info = PdfFileReader(pdf_stream).getDocumentInfo()
        # A document without an info dictionary yields None; treat it as
        # having no entries.
        for meta_item in doc_info or {}:
            # format() copes with metadata values that are not plain strings.
            print("- {}:{}".format(meta_item, doc_info[meta_item]))
    print("\n")
def pdfHandler(file_dir):
    """Print a short report about the PDF at ``file_dir``.

    The report contains the page count, the types of the objects returned by
    ``getFields()``, ``getDocumentInfo()`` and ``getXmpMetadata()``, every
    document-info entry, and every ``dc_relation`` value of the XMP metadata.

    Args:
        file_dir: Path of the PDF file to inspect.

    Returns:
        None. All output goes to standard output.
    """
    # Keep the file open only for the duration of the report so the handle
    # is always released, even if parsing fails part-way through.
    with open(file_dir, 'rb') as stream:
        input1 = PdfFileReader(stream)
        # Report the file that was actually read rather than a fixed name.
        print('{} has {} pages.'.format(file_dir, input1.getNumPages()))

        fields = input1.getFields()
        print(type(fields))

        documentInfo = input1.getDocumentInfo()
        print(type(documentInfo))
        if documentInfo is not None:
            for key in documentInfo.keys():
                print('{} : {}'.format(key, documentInfo.get(key)))

        metaData = input1.getXmpMetadata()
        print(type(metaData))
        if metaData is not None:
            for relation in metaData.dc_relation:
                print('relation: {}'.format(relation))
def invoice_pdf(request, number, correction=False):
    """Render an invoice, or its correction, as a PDF HTTP response.

    The document is built and rendered with the invoice's language
    activated. When background PDFs are configured in
    ``settings.SHARK['INVOICE']['BACKGROUND']`` the content is first rendered
    to a temporary file and every page is merged with a background page;
    otherwise the content is rendered straight into the response.

    Args:
        request: The incoming HTTP request. A ``download`` key in
            ``request.GET`` adds an attachment ``Content-Disposition`` header.
        number: Number of the invoice to look up; a missing invoice results
            in a 404 through ``get_object_or_404``.
        correction: When true, ``invoice.correction`` is rendered instead of
            the invoice itself and the subject line says so.

    Returns:
        An ``HttpResponse`` with content type ``application/pdf``.
    """
    invoice = get_object_or_404(Invoice, number=number)
    if correction:
        invoice = invoice.correction

    # PDF toolkit imports are kept local to this view.
    from reportlab.lib.units import mm
    from reportlab.platypus import Paragraph
    from reportlab.platypus.flowables import Spacer
    from reportlab.platypus.flowables import KeepTogether

    from dinbrief.document import Document
    from dinbrief.invoice import ItemTable, TotalTable
    from dinbrief.styles import styles
    from dinbrief.template import BriefTemplate

    with trans_override(invoice.language):
        response = HttpResponse(content_type='application/pdf')
        if 'download' in request.GET:
            filename = '%s.pdf' % invoice.number
            response['Content-Disposition'] = \
                'attachment; filename=%s' % filename

        # Terms are only printed on regular invoices. INVOICE_TERMS is either
        # a callable producing the flowables or an iterable of paragraph
        # texts.
        if invoice.type == Invoice.TYPE_INVOICE:
            if callable(INVOICE_TERMS):
                terms = INVOICE_TERMS(invoice)
            else:
                terms = [
                    Paragraph(term, styles['Terms'])
                    for term in INVOICE_TERMS
                ]
        else:
            terms = []

        if correction:
            subject = gettext(u'Correction of invoice')
        else:
            subject = invoice.get_type_display()

        template = BriefTemplate()
        document = Document(
            sender=invoice.sender_lines,
            recipient=invoice.recipient_lines,
            date=date_format(invoice.created, 'SHORT_DATE_FORMAT'),
            content=[
                Paragraph('%s %s' % (subject, invoice.number),
                          styles['Subject']),
                Spacer(template.CONTENT_WIDTH, 2 * mm),
                ItemTable(template, invoice),
                KeepTogether(TotalTable(template, invoice)),
                Spacer(template.CONTENT_WIDTH, 10 * mm),
            ] + terms)

        background = settings.SHARK['INVOICE']['BACKGROUND']
        if background:
            with tempfile.TemporaryFile() as tmp:
                # Create content in a temporary file
                template.render(document, tmp)

                # Combine background with the content
                writer = PdfFileWriter()
                content = PdfFileReader(tmp)
                info_dict = writer._info.getObject()
                info_dict.update(content.getDocumentInfo())

                # The background files are closed deterministically, but only
                # after writer.write(): the readers may still need their
                # source streams while the merged pages are serialised.
                with open(background['FIRST_PAGE'], 'rb') as first_file, \
                        open(background['LATER_PAGE'], 'rb') as later_file:
                    first_bg = PdfFileReader(first_file)
                    later_bg = PdfFileReader(later_file)
                    bg = [first_bg.getPage(0), later_bg.getPage(0)]
                    for i, page in enumerate(content.pages):
                        # Page 0 gets the first-page background, every later
                        # page the other one.
                        page.mergePage(bg[min(i, 1)])
                        page.compressContentStreams()
                        writer.addPage(page)
                    writer.write(response)
        else:
            # Render content directly to the HTTP response object if no
            # background images are configured.
            template.render(document, response)

    return response