def main():
    """Rename every PDF under args.path to '[Author] Title.pdf' in args.dest.

    Author and title come from the PDF document-info dictionary; a missing
    author falls back to 'Unknown' and a missing title to the original file
    name without extension. When args.dryrun is set the rename is only
    printed, not performed.
    """
    args = do_cmd_args_line()
    for f in os.listdir(args.path):
        if not f.endswith('.pdf'):
            continue
        fname = os.path.join(args.path, f)
        # 'rb' is required by PyPDF2; the context manager closes the handle
        # (the original used the Python-2-only file() and leaked it).
        with open(fname, 'rb') as fh:
            info = PdfFileReader(fh).getDocumentInfo()
            title = info.title
            author = info.author
        if not author:
            author = 'Unknown'
        if not title:
            title = os.path.splitext(f)[0]
        tgtfname = '[{0}] {1}.pdf'.format(author, title)
        ftgtname = os.path.join(args.dest, tgtfname)
        print('renaming {0} -> {1}'.format(fname, ftgtname))
        if not args.dryrun:
            try:
                os.rename(fname, ftgtname)
            except Exception as e:
                # Best-effort: report (e.g. cross-device move) and continue.
                print(e)
def get_pdf_title(pdf_file_path):
    """Return the /Title entry of the PDF's info dictionary, or None.

    The file must be opened as 'rb', otherwise PyPDF2 raises
    "PdfReadError: EOF marker not found".
    """
    with open(pdf_file_path, 'rb') as f:
        info = PdfFileReader(f).getDocumentInfo()
        # Single lookup with .get() replaces the original's membership test
        # followed by a second full getDocumentInfo() call.
        return info.get('/Title')
def iter_pdf_page_text(self, filename):
    """Yield ASCII-only text for each page of *filename*.

    Also derives a YYYY-MM-DD string from /CreationDate when present
    (format 'D:YYYYMMDDHHmmSS...'), falling back to "now" when it is
    absent or malformed.
    """
    self.filename = filename
    reader = PdfFileReader(open(filename, "rb"))
    logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
    metadata = reader.getDocumentInfo()
    logging.info("METADATA: " + str(metadata))
    try:
        # dict-style 'in' replaces the Python-2-only has_key().
        if '/CreationDate' in metadata:
            # /CreationDate is 'D:YYYYMMDD...' -- the original slices were
            # one character short ([2:5], [6:7], [8:9]).
            raw = metadata['/CreationDate']
            year = raw[2:6]
            month = raw[6:8]
            day = raw[8:10]
            mydate = year + "-" + month + "-" + day
        else:
            mydate = datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S")
    except Exception:
        # hack ... but sometimes /CreationDate is bunged
        mydate = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    for pgnum in range(reader.getNumPages()):
        text = reader.getPage(pgnum).extractText()
        # Round-trip through ASCII to drop non-ASCII characters while
        # keeping a str (the py2 original left bytes, breaking replace()).
        text = text.encode('ascii', 'ignore').decode('ascii')
        text = text.replace('\n', ' ')
        yield text
def get_file_info(fn):
    """ Get the metadata stored in an image file returning ``None`` on failure. """
    extension = os.path.splitext(fn)[1].lower()
    if extension == ".png":
        if Image is None:
            raise ImportError("PIL or pillow must be installed to read "
                              "metadata from PNG files.")
        return Image.open(fn).info
    if extension == ".pdf":
        if PdfFileReader is None:
            raise ImportError("PyPDF2 must be installed to read "
                              "metadata from PDF files.")
        with open(fn, "rb") as handle:
            info = PdfFileReader(handle).getDocumentInfo()
            if "/Keywords" not in info:
                return None
            try:
                return json.loads(info["/Keywords"])
            except ValueError:
                return None
    # Any other extension carries no supported metadata.
    return None
def _merge_pdf_images(self, docf, stream, outlines):
    """Overlay self._pdf_images onto the pages of *docf*, rebuild bookmarks
    from *outlines*, and write the merged PDF to *stream*.

    :param docf: file-like object whose ``.name`` is the source PDF path.
    :param stream: writable binary stream receiving the output PDF.
    :param outlines: iterable of (pageno, level, header) bookmark entries.
    """
    pdfin = PdfFileReader(docf.name)
    pdfout = PdfFileWriter()
    # Copy the source document-info dictionary onto the output writer.
    pdfout._info.getObject().update(pdfin.getDocumentInfo())
    # embed images into file
    for pageno, page in enumerate(pdfin.pages):
        for img in self._pdf_images:
            # Image page numbers are 1-based; enumerate() is 0-based.
            if img.page != (pageno + 1):
                continue
            # Load image
            imgin = PdfFileReader(img.fname)
            imgpage = imgin.getPage(0)
            # Uniform scale so the image fits the requested width/height
            # relative to the image page's media box.
            scale = min(img.width / imgpage.mediaBox[2].as_numeric(),
                        img.height / imgpage.mediaBox[3].as_numeric())
            page.mergeScaledTranslatedPage(imgpage, scale, img.x, img.y)
        pdfout.addPage(page)
    # create outlines
    stack = []
    for pageno, level, header in outlines:
        # Truncate the stack to the current nesting level.
        # NOTE(review): parent is taken from stack[0] (the outermost
        # ancestor), not the nearest one -- confirm this flat nesting
        # is intended.
        stack = stack[:level]
        parent = (stack[0] if stack else None)
        stack.append(pdfout.addBookmark(header.strip(), pageno - 1, parent))
    pdfout.write(stream)
def __init__(self, file_abs_path):
    """
    __init__(self, file_abs_path):

    Arguments:
    - file_abs_path: (string) Absolute file path.
    """
    self.absolute_path = file_abs_path
    self.name = os.path.basename(self.absolute_path)
    application_messages.print_file_name(self.name)
    application_messages.print_document_info('Path', self.absolute_path)
    try:
        # open() replaces the Python-2-only file() builtin; the context
        # manager closes the handle the original leaked.
        with open(self.absolute_path, 'rb') as fh:
            document = PdfFileReader(fh)
            self.__get_encrypted_status(document)
            document_info = document.getDocumentInfo()
            if document_info:
                self.__parse_document_info(document_info)
    except Exception as ex:
        # Encoding errors from odd metadata are tolerated; anything else
        # is re-raised wrapped, as before.
        if 'encode' not in str(ex):
            raise Exception(ex)
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build a BookMeta for an uploaded PDF.

    Prefers the embedded document-info author/title/subject (when the
    use_pdf_meta flag is set and info is present); otherwise falls back to
    'Unknown' / the original file name / an empty subject.
    """
    if (use_pdf_meta):
        # Close the handle once the info dictionary has been read
        # (the original leaked the open file object).
        with open(tmp_file_path, 'rb') as fh:
            doc_info = PdfFileReader(fh).getDocumentInfo()
    else:
        doc_info = None
    if (doc_info is not None):
        author = doc_info.author if doc_info.author is not None else "Unknown"
        title = doc_info.title if doc_info.title is not None else original_file_name
        subject = doc_info.subject
    else:
        author = "Unknown"
        title = original_file_name
        subject = ""
    return uploader.BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags="",
        series="",
        series_id="")
def check_file_for_processing(self, ev_path):
    """
    This checks a path to see if it we should process it.

    :param ev_path: Fully qualified path to file to check
    :return: True if it should be convertred. False if not
    """
    # Guard clauses: only plain .pdf files that are not already OCR output
    # and do not carry the configured archive suffix.
    if not ev_path.endswith(".pdf") or ev_path.endswith("_ocr.pdf"):
        return False
    if self.archive_suffix and ev_path.endswith(self.archive_suffix):
        return False
    try:
        with open(ev_path, "rb") as handle:
            info = PdfFileReader(handle).getDocumentInfo()
            # It has been OCR'ed'
            if info is not None and '/PyPDFOCR' in info:
                return False
    except IOError:
        return False
    except PdfReadError:
        return False
    return True
def test_backlog_list(self):
    # End-to-end check of the "print stories" PDF export: render a sample
    # backlog twice (A4/long edge, then letter/short edge) and inspect the
    # resulting PDF's metadata, page count and page geometry.
    user = factories.UserFactory.create(
        email='*****@*****.**', password='******')
    backlog = factories.create_project_sample_backlog(user)
    for i in range(0, 10):
        factories.create_sample_story(user, backlog=backlog)
    # special printing of -1 points story
    story = factories.UserStory.objects.all()[0]
    story.points = -1
    story.save()
    url = reverse("print_stories")
    url_plus = "{0}?backlog_id={1}".format(url, backlog.pk)
    # Anonymous access must redirect (302); authenticated access renders.
    self.app.get(url_plus, status=302)
    response = self.app.get(url_plus, user=user)
    form = response.forms['print_pdf_form']
    # Tick every story checkbox in the print form.
    for k, f in form.fields.items():
        if k and "story-" in k:
            form[k] = True
    form['print-side'] = "long"
    form['print-format'] = "a4"
    response = form.submit()
    self.assertEqual(response['Content-Type'], "application/pdf")
    o = StringIO.StringIO(response.content)
    pdf = PdfFileReader(o)
    info = pdf.getDocumentInfo()
    self.assertEqual(pdf.getNumPages(), 6)
    self.assertEqual("backlogman.com", info['/Author'])
    # A4 is not "round" in PDF unit format real value are
    # approximately : [0, 0, 841.88980, 595.27560]
    self.assertEqual([0, 0, 841, 595],
                     [int(x) for x in pdf.getPage(0)["/MediaBox"]])
    # Second pass: letter format, short-edge binding.
    response = self.app.get(url_plus, user=user)
    form = response.forms['print_pdf_form']
    for k, f in form.fields.items():
        if k and "story-" in k:
            form[k] = True
    form['print-side'] = "short"
    form['print-format'] = "letter"
    response = form.submit()
    self.assertEqual(response['Content-Type'], "application/pdf")
    o = StringIO.StringIO(response.content)
    pdf = PdfFileReader(o)
    info = pdf.getDocumentInfo()
    self.assertEqual(pdf.getNumPages(), 6)
    self.assertEqual("backlogman.com", info['/Author'])
    # Letter size is exact in PDF units, so compare without rounding.
    self.assertEqual([0, 0, 792, 612], pdf.getPage(0)["/MediaBox"])
def getContentInformation(self):
    """Returns the information about the PDF document with pdfinfo.
    """
    if not self.hasData():
        return dict()
    # Return the cached copy when a previous call already populated it.
    try:
        return self._content_information.copy()
    except AttributeError:
        pass
    # Dump the raw PDF bytes to a temp file so external tools can read it.
    tmp = tempfile.NamedTemporaryFile()
    tmp.write(self.getData())
    tmp.seek(0)
    command_result = None
    # NOTE(review): the closing handler/finally of this outer try is not
    # visible in this chunk -- the definition appears truncated here.
    try:
        # First, we use pdfinfo to get standard metadata
        command = ['pdfinfo', '-meta', '-box', tmp.name]
        try:
            command_result = Popen(command, stdout=PIPE).communicate()[0]
        except OSError, e:
            if e.errno == errno.ENOENT:
                raise ConversionError('pdfinfo was not found')
            raise
        # Each output line is 'Key: value'; values may themselves contain
        # colons, so everything after the first colon is re-joined.
        result = {}
        for line in command_result.splitlines():
            item_list = line.split(':')
            key = item_list[0].strip()
            value = ':'.join(item_list[1:]).strip()
            result[key] = value
        # Then we use PyPDF2 to get extra metadata
        try:
            from PyPDF2 import PdfFileReader
            from PyPDF2.utils import PdfReadError
        except ImportError:
            # if PyPDF2 not found, pass
            pass
        else:
            try:
                pdf_file = PdfFileReader(tmp)
                for info_key, info_value in (pdf_file.getDocumentInfo() or {}).iteritems():
                    info_key = info_key.lstrip("/")
                    if isinstance(info_value, unicode):
                        info_value = info_value.encode("utf-8")
                    # Ignore values that cannot be pickled ( such as AAPL:Keywords )
                    try:
                        pickle.dumps(info_value)
                    except pickle.PicklingError:
                        LOG("PDFDocument.getContentInformation", INFO,
                            "Ignoring non picklable document info on %s: %s (%r)" % (
                            self.getRelativeUrl(), info_key, info_value))
                    else:
                        # pdfinfo values win; PyPDF2 only fills the gaps.
                        result.setdefault(info_key, info_value)
            except PdfReadError:
                LOG("PDFDocument.getContentInformation", PROBLEM,
                    "PyPDF2 is Unable to read PDF, probably corrupted PDF here : %s" % \
                    (self.getRelativeUrl(),))
def get_title(path):
    """Return the stripped /Title of the PDF at *path*, or '' when the
    document has no info dictionary or no title."""
    # Context manager closes the handle the original leaked.
    with open(path, "rb") as fh:
        metadata = PdfFileReader(fh).getDocumentInfo()
    if metadata is not None and metadata.title is not None:
        # Method call replaces the awkward unbound str.strip(...) form.
        return metadata.title.strip()
    return ""
def extract_title_pdf(filename):
    """Return the /Title metadata entry of *filename*; None when the file
    cannot be read."""
    title = None
    try:
        with open(filename, 'rb') as handle:
            document_info = PdfFileReader(handle).getDocumentInfo()
            title = document_info.get('/Title')
    except IOError:
        # Unreadable file: keep the None default.
        title = None
    return title
def test_docs_contents(self, cnf_about_pg): '''Test contents of each document''' # # Disabling for https://bugzilla.redhat.com/show_bug.cgi?id=1026943 # for link in cnf_about_pg.docs_links: doc = requests.get(link['text_url'], verify=False) pdf = PdfFileReader(StringIO.StringIO(doc.content)) pdf_info = pdf.getDocumentInfo()
def getMetadataField(self, pdf_filename, field_name):
    """Return the '/<field_name>' entry of the PDF's info dictionary, or
    None when the document does not define it."""
    wanted_key = '/' + field_name
    with open(pdf_filename, 'rb') as file_input:
        reader = PdfFileReader(file_input)
        document_info = reader.getDocumentInfo()
    # Conditional expression replaces the if/else return pair.
    return document_info[wanted_key] if wanted_key in document_info.keys() else None
def printMeta():
    """Walk the 'doc_pdf' tree and print the info dictionary of every PDF."""
    for dirpath, dirnames, files in os.walk("doc_pdf"):
        for name in files:
            ext = name.lower().rsplit('.', 1)[-1]
            if ext in ['pdf']:
                path = dirpath + os.path.sep + name
                print("[+] Metadata for file: %s " % path)
                # open() replaces the Python-2-only file() builtin and the
                # context manager closes the leaked handle.
                with open(path, 'rb') as fh:
                    docInfo = PdfFileReader(fh).getDocumentInfo()
                for metaItem in docInfo:
                    print('[+] ' + metaItem + ':' + docInfo[metaItem])
def getPyPDF2Info(self):
    """Populate self.author/title/subject/year from the downloaded PDF's
    document-info dictionary and XMP metadata."""
    # Keep the handle scoped with a context manager; the original leaked it.
    with open(self.dir_downloads + self.filename + '.pdf', 'rb') as fh:
        pdf_file = PdfFileReader(fh)
        pdf_info = pdf_file.getDocumentInfo()
        self.author = pdf_info.author
        self.title = pdf_info.title
        self.subject = pdf_info.subject
        pdf_meta = pdf_file.getXmpMetadata()
        self.year = str(pdf_meta.xmp_createDate)
def get_author(path):
    """Return the author recorded in the PDF's info dictionary.

    Returns None (implicitly) when the document has no info dictionary.

    :type path: str -- presumably a filesystem path; only used with open()
    """
    # Context manager closes the handle the original leaked.
    with open(path, "rb") as fh:
        metadata = PdfFileReader(fh).getDocumentInfo()
    if metadata is not None:
        return metadata.author
def get_pdf_metadata(fn):
    """Return a dict with author/creator/producer/subject/title of *fn*.

    On any read/parse error the error is printed and whatever was collected
    (possibly an empty dict) is returned.
    """
    meta = dict()
    try:
        with open(os.path.abspath(fn), "rb") as pdf_file:
            pdf = PdfFileReader(pdf_file)
            # if pdf.isEncrypted:
            #     pdf.decrypt('')
            # Read the info dictionary once instead of five times.
            info = pdf.getDocumentInfo()
            meta['author'] = info.author
            meta['creator'] = info.creator
            meta['producer'] = info.producer
            meta['subject'] = info.subject
            meta['title'] = info.title
    except Exception as e:
        print(e)
        print('file: %s' % fn)
    return meta
def test_contents(guides, soft_assert):
    """Test contents of each document."""
    pytest.sel.force_navigate("about")
    for link in guides:
        locator = getattr(about, link)
        url = pytest.sel.get_attribute(locator, "href")
        # Appliance serves a self-signed certificate, hence verify=False.
        data = requests.get(url, verify=False)
        pdf = PdfFileReader(StringIO(data.content))
        pdf_info = pdf.getDocumentInfo()
        # Every guide's /Title must mention the product and match the
        # visible link text.
        soft_assert("CloudForms" in pdf_info["/Title"],
                    "CloudForms is not in the title!")
        soft_assert(pytest.sel.text(locator) in pdf_info["/Title"],
                    "{} not in {}".format(
                        pytest.sel.text(locator), pdf_info["/Title"]))
def get_info_for_file(filepath):
    """Return [(key, value), ...] pairs of the PDF's truthy info entries.

    An unreadable/malformed info dictionary yields an empty list.
    """
    out = []
    # Context manager closes the handle the original leaked.
    with open(filepath, 'rb') as fh:
        reader = PdfFileReader(fh)
        try:
            info = reader.getDocumentInfo()
            for key in info:
                if info[key]:
                    out.append((key, info[key]))
        except Exception:
            # Best effort, as before -- but no longer a bare except.
            pass
    return out
def process(self, content, mimetype='application/pdf'):
    """Process a PDF document.

    Args:
        content: Binary content of the document.
        mimetype: Id of MIME type (content ignored if it isn't
            `application/pdf`).

    Returns:
        Tuple: Relevancy of the document (based on keywords)
               Metadata extracted from the document (dictionary).
    """
    relevancy = 0
    metadata = {}
    if mimetype == 'application/pdf':
        # Obtain metadata
        doc = PdfFileReader(BytesIO(content))
        info = doc.getDocumentInfo()
        if info:
            for k in info:
                metadata[k] = info.getText(k)
        # Extra metadata
        metadata['_num_pages'] = doc.getNumPages()
        # Process title, subject and metadata keywords
        # TODO guess title from page text when not provided
        if self.keywords:
            relevant = (metadata.get('/Title', '') + ' ' +
                        metadata.get('/Subject', '') + ' ' +
                        metadata.get('/Keywords', '')).lower()
            for word in self.keywords:
                if word.lower() in relevant:
                    # Each relevant keyword increases relevancy in 10 points
                    relevancy += 10
            # Process pages.
            distance_factor = 1
            for p in range(doc.getNumPages()):
                # Break if factor is too low
                if distance_factor < 0.01:
                    break
                try:
                    text = doc.getPage(p).extractText().lower()
                    # Occurrences on later pages are weighted down by the
                    # geometrically decaying distance factor.
                    for word in self.keywords:
                        relevancy += distance_factor * text.count(word.lower())
                except Exception as ex:
                    # Some bad formed PDFs raise decoding errors. Skip page.
                    pass
                # Each new page reduces relevancy factor in a half
                distance_factor /= 2
            # Relevancy is significant by the nearest tenth
            relevancy = round(relevancy, 1)
    else:
        relevancy = 0
    metadata['_relevancy'] = relevancy
    return relevancy, metadata
def getMetacsv(folder):
    """Write one CSV row (to METADATA) of metadata per .pdf in *folder*.

    Missing entries become empty strings. The original's ``try:`` had no
    visible handler (the block appears truncated), so a broad
    skip-and-continue per file is restored here; the CSV handle is now
    closed via a context manager and the nine copy-pasted if/else blocks
    are collapsed into a field-list loop.
    """
    # Column order mirrors the header row written below.
    fields = ['/Author', '/Company', '/Producer', '/Title', '/Creator',
              '/CreationDate', '/ModDate', '/Subject', '/Keywords']
    with open(METADATA, 'w') as csvfile:
        csvwriter = csv.writer(csvfile, dialect='excel')
        csvwriter.writerow(['FILENAME', 'Author', 'Company', 'Producer',
                            'Title', 'Creator', 'Creation Date',
                            'Modified Date', 'Subject', 'Keywords'])
        for filename in os.listdir(folder):
            if '.pdf' not in filename:
                continue
            try:
                with open(folder + '/' + filename, 'rb') as fh:
                    docInfo = PdfFileReader(fh).getDocumentInfo()
                metadata = [docInfo[f].strip() if f in docInfo else ''
                            for f in fields]
                csvwriter.writerow([filename] + metadata)
            except Exception:
                # Unreadable/corrupt PDF: skip this file.
                continue
def printMetaData(self):
    """Walk the 'pdfs' tree and print every PDF's info dictionary."""
    for dirpath, dirnames, files in os.walk("pdfs"):
        try:
            for name in files:
                ext = name.lower().rsplit('.', 1)[-1]
                if ext in ['pdf']:
                    path = dirpath + os.path.sep + name
                    print("[+] Metadata for file: %s " % path)
                    # open() replaces the Python-2-only file() builtin.
                    with open(path, 'rb') as fh:
                        docInfo = PdfFileReader(fh).getDocumentInfo()
                    for metaItem in docInfo:
                        print('[+] ' + metaItem + ':' + docInfo[metaItem])
                    print("\n")
        except Exception as e:
            # 'except Exception, e' is Python-2-only syntax; fixed.
            print("Error to Obtain PDF METADATA")
            pass
def printMeta(fileName):
    """Open a PDF and print its document-info metadata."""
    # Open the PDF in binary mode; the context manager closes the handle
    # the original leaked. (Comments translated from Portuguese.)
    with open(fileName, 'rb') as fh:
        docInfo = PdfFileReader(fh).getDocumentInfo()
    print('[*] PDF MetaData For: %s' % str(fileName))
    # Print each entry when info exists.
    if docInfo:
        for metaItem in docInfo:
            print('[+] %s:%s' % (metaItem, docInfo[metaItem]))
    else:
        print('[+] No Document Info')
def pdf_parser(s):
    """Parse a PDF from the string *s* and return its info dictionary with
    the leading '/' stripped from each key; {} when the document is
    encrypted with an unsupported scheme."""
    s = s.strip()
    # required to suppress warning messages
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        #print(str(meta))
        # Dict comprehension replaces the manual accumulation loop.
        return {key[1:]: meta.get(key) for key in meta.keys()}
def get_metadata_from_file(x):
    """Get metadata for file x. Returns a tuple (author, title)"""
    # Context manager guarantees the handle closes even when PyPDF2 raises
    # (the original leaked it on error).
    with open(x, 'rb') as pdf_file:
        dictinfo = PdfFileReader(pdf_file).getDocumentInfo()
    if dictinfo is None:
        # No info dictionary at all: both fields unknown.
        return (None, None)
    # .get() replaces the bare try/except lookups.
    return (dictinfo.get('/Author'), dictinfo.get('/Title'))
def __Get_info(file_path, plain_log, csv_log, analyzed_files, total_files):
    """
    Get_info(file_path)
    Opens the pdf file for reading.

    Args:
    - file_path: (string) Absolute file path.
    - plain_log: (None | string) Log file in plain text.
    - csv_log: (None | string) Log file in csv format.
    """
    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)
    encrypted = 'No'
    try:
        # Try to open not password encrypted pdf files and pdf files
        # encrypted with a blank password.
        pdf_file = PdfFileReader(open(file_path, 'rb'))
        if pdf_file.getIsEncrypted() is True:
            dec_res = pdf_file.decrypt('')
            if dec_res == 1:
                encrypted = 'Yes'
        # Get and parse metadata
        doc_info = pdf_file.getDocumentInfo()
        title, author, creator, subject, producer, c_date, m_date \
            = __Parse_doc_info(doc_info)
        num_pages = pdf_file.getNumPages()
        # Group info
        pdf_meta = pdf_metadata(file_name, title, author, creator, subject,
                                producer, c_date, m_date, encrypted,
                                num_pages, file_size)
        __Print_metadata(pdf_meta)
        if plain_log:
            Log(file_name, pdf_meta, plain_log, 'txt')
        if csv_log:
            # BUG FIX: the original passed the undefined name f_log_csv
            # here, raising NameError whenever csv logging was requested.
            Log(file_name, pdf_meta, csv_log, 'csv')
        # NOTE(review): rebinding only affects the local name; the caller's
        # counter is not updated -- confirm intent.
        analyzed_files = analyzed_files + 1
    except Exception as e:
        # 'except Exception, e' is Python-2-only syntax; fixed.
        error = file_name + ' ' + str(e)
        __Print_error(error)
def extract_creation_date(filename):
    """Parse the PDF /CreationDate ('D:YYYYMMDDHHmmSS...') into a datetime."""
    # Add strict=False in order to avoid 'PdfReadWarning: Xref table not
    # zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]'
    pdf_toread = PdfFileReader(open(filename, "rb"), strict=False)
    # "file has not been decrypted" error https://github.com/mstamy2/PyPDF2/issues/51
    if pdf_toread.isEncrypted:
        pdf_toread.decrypt('')
    pdf_info = pdf_toread.getDocumentInfo()
    # PDF Reference, 3.8.3 Dates, http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
    # A date is an ASCII string of the form (D:YYYYMMDDHHmmSSOHH'mm'),
    # e.g. D:20170508085336+02'00' -- keep only the 14-digit timestamp.
    raw_date = pdf_info['/CreationDate']
    date_digits = re.search('^D:(\d{14})', raw_date).group(1)
    return datetime.strptime(date_digits, "%Y%m%d%H%M%S")
def run(self):
    """
    Thread that waits on a queue and rips metadata from the pdfs and
    updates their entries in rethinkdb
    """
    while True:
        docid = self.in_queue.get()
        document = self.index.get({'id': docid})
        # open() replaces the Python-2-only file() builtin; the context
        # manager closes the handle once the info dictionary is read.
        with open(document['path'], 'rb') as fh:
            doc_info = PdfFileReader(fh).getDocumentInfo()
        pdf_info = {
            'title': doc_info.title or '',
            'author': doc_info.author or '',
            'creator': doc_info.creator or '',
            'producer': doc_info.producer or ''
        }
        r.table('documents').filter(
            {'id': docid}).update(
            {'pdfinfo': pdf_info}).run(self.index.rdb)
def get_file_info(fn):
    """ Get the metadata stored in an image file returning ``None`` on failure. """
    extension = os.path.splitext(fn)[1].lower()
    if extension == ".png":
        return Image.open(fn).info
    if extension == ".pdf":
        with open(fn, "rb") as handle:
            info = PdfFileReader(handle).getDocumentInfo()
            if "/Keywords" not in info:
                return None
            try:
                return json.loads(info["/Keywords"])
            except ValueError:
                return None
    # Unsupported extension: no metadata.
    return None
def get_info_pdf(filename):
    """Exercise the basic PyPDF2 reader accessors on *filename*.

    Collects the info dictionary, page count, a single page, the page
    layout and that page's number; nothing is returned (demonstration
    code). Comments translated from Chinese; the leaked file handle is
    now closed by a context manager.
    """
    with open(filename, 'rb') as file_stream:
        # Reader instance for the PDF file.
        pdf_reader = PdfFileReader(file_stream)
        # Document information dictionary.
        document_info = pdf_reader.getDocumentInfo()
        # Total number of pages.
        pdf_page_nums = pdf_reader.getNumPages()
        # A single PageObject -- note index 1 is the SECOND page.
        single_page = pdf_reader.getPage(1)
        # Page layout.
        pdf_layout = pdf_reader.getPageLayout()
        # Page number of the given PageObject.
        page_num = pdf_reader.getPageNumber(single_page)
def Analyze_Metadata_pdf(filename):
    ####### FUNCTION AnalyzeMetadata ######
    """Print the PDF's metadata and accumulate unique /Author, /Producer
    and /Creator values into the module-level arrays."""
    # open() replaces the Python-2-only file(); handle closed promptly.
    with open(filename, 'rb') as fh:
        metadata = PdfFileReader(fh).getDocumentInfo()
    print(' - Document: ' + str(filename))
    for meta in metadata:
        value = (metadata[meta])
        print(' - ' + meta + ':' + metadata[meta])
        if meta == "/Author":
            if value not in meta_author_array:
                meta_author_array.append(value)
        elif meta == "/Producer":
            if value not in meta_producer_array:
                meta_producer_array.append(value)
        elif meta == "/Creator":
            if value not in meta_creator_array:
                meta_creator_array.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call,
    # so metadata_files accumulates duplicates -- confirm intent.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
def get_info(path):
    """Read and print the document info of the PDF at *path*.

    BUG FIX: the original ignored *path* and always opened the hard-coded
    'Bank Balance Statement.pdf'.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        print(info)
        title = info.title
def printMeta(ruta):
    """Walk *ruta* and print the metadata of every .pdf file found.

    (Comments translated from Spanish; Python-2 print/file() fixed.)
    """
    for dirpath, dirnames, files in os.walk(ruta):
        for name in files:
            ext = name.lower().rsplit('.', 1)[-1]
            if ext in ['pdf']:
                path = dirpath + os.path.sep + name
                # ANSI-red header naming the file being inspected.
                print(chr(27) + "[0;31m" + "[+] Metadata for file: %s " % path + chr(27) + "[0m")
                # open() replaces the Python-2-only file() builtin.
                with open(path, 'rb') as fh:
                    docInfo = PdfFileReader(fh).getDocumentInfo()
                for metaItem in docInfo:
                    print('[+] ' + metaItem + ':' + docInfo[metaItem])
                print("\n")
def extract_information(pdf_path):
    """Print a summary of the PDF's metadata and return its info dict.

    Fixes two defects in the original: a redundant second PdfFileReader
    built from the path (instead of reusing the open reader), and the
    invalid f-string literal (there was a space between the ``f`` prefix
    and the quotes -- a SyntaxError).
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        page = pdf.getPage(0)
        txt = f"""
        Information about {pdf_path}:

        Author: {information.author}
        Creator: {information.creator}
        Producer: {information.producer}
        Subject: {information.subject}
        Title: {information.title}
        Number of pages: {number_of_pages}
        """
    print(page)
    return information
def info(self):
    """Get maetadata information about PDF
    """
    meta_info_data = {}
    with open(self.filepath, 'rb') as file:
        # initialize the PDF reader object
        reader = PdfFileReader(file)
        # Encrypted documents must be decrypted before reading metadata.
        if reader.isEncrypted:
            reader.decrypt(self.password)
        # Document information dictionary, XMP metadata and page count.
        doc_info = reader.getDocumentInfo()
        xmp = reader.getXmpMetadata()
        page_total = reader.getNumPages()
        if doc_info is not None:
            meta_info_data.update({
                'filepath': self.filepath,
                'author': doc_info.author,
                'creator': doc_info.creator,
                'producer': doc_info.producer,
                'subject': doc_info.subject,
                'title': doc_info.title,
                'number_of_pages': page_total,
            })
            if xmp is not None:
                xmp_pairs = (
                    ('format', xmp.dc_format),
                    ('createDate', xmp.xmp_createDate),
                    ('modifyDate', xmp.xmp_modifyDate),
                    ('metadataDate', xmp.xmp_metadataDate),
                    ('creatorTool', xmp.xmp_creatorTool),
                )
                for xmp_key, xmp_value in xmp_pairs:
                    if isinstance(xmp_value, datetime):
                        # Render datetimes as 'YYYY-MM-DD HH:MM:SS'.
                        meta_info_data[xmp_key] = '{} {}'.format(
                            xmp_value.date(), xmp_value.time())
                    else:
                        meta_info_data[xmp_key] = xmp_value
            return meta_info_data
    return meta_info_data
async def main():
    # Fetch the form catalogue once, then resolve every task code in
    # task1_data against it, printing one JSON object per code.
    forms: Dict[str, Form] = await get_forms()
    for task_code in task1_data:
        parsed_task_code: str = parse_task_code(task_code)
        if parsed_task_code not in forms:
            print(
                json.dumps({
                    'error': True,
                    'message': f'task code "{parsed_task_code}" does not exist',
                }))
        else:
            async with ClientSession() as PdfPage.session:
                pdf_page: PdfPage = PdfPage(forms[parsed_task_code].url())
                content: bytes = await pdf_page.content()
                form_dict: Dict[str, Any] = forms[parsed_task_code].to_dict()
                form_dict['form_title'] = ''
                form_dict['form_number'] = task_code
                if not content:
                    form_dict['error_message'] = 'could not fetch PDF content'
                    print(json.dumps(form_dict))
                else:
                    try:
                        # PyPDF2 is noisy on slightly malformed PDFs;
                        # silence its warnings while parsing.
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore")
                            pdf = PdfFileReader(BytesIO(content))
                            info = pdf.getDocumentInfo()
                        try:
                            # The form title is stored in /Subject.
                            subject: str = info['/Subject']
                            form_dict['form_title'] = subject
                        except:
                            form_dict[
                                'error_message'] = 'PDF error, /Subject does not exist'
                    except:
                        form_dict['error_message'] = 'PDF reading error'
                    # Internal keys; drop before printing.
                    _ = form_dict.pop('code')
                    _ = form_dict.pop('url')
                    print(json.dumps(form_dict))  # PRINT RESULT
def get_metadata(path):
    """Return (info, number_of_pages) for the PDF at *path*.

    When the document has no info dictionary a plain dict with an empty
    '/Author' entry is returned instead, so callers can always index it.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f, strict=False)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
    # Dead locals (author/creator/...) and commented debug prints removed.
    if info is None:
        info = {}
        info.setdefault('/Author', '')
    return (info, number_of_pages)
def extract_metadata(self, file_path):
    """Pull document-info and XMP metadata from a PDF into the result."""
    with open(file_path, 'rb') as fh:
        pdf = PdfFileReader(fh, strict=False)
        meta = pdf.getDocumentInfo()
        if meta is not None:
            # Field/value pairs fed through the common update hook.
            for field, value in (('title', meta.title),
                                 ('author', meta.author),
                                 ('generator', meta.creator),
                                 ('generator', meta.producer)):
                self.update(field, value)
            if meta.subject:
                self.result.keywords.append(meta.subject)
        xmp = pdf.getXmpMetadata()
        if xmp is not None:
            self.update('id', xmp.xmpmm_documentId)
            for lang, title in xmp.dc_title.items():
                self.update('title', title)
                self.result.languages.append(lang)
            self.update('generator', xmp.pdf_producer)
            self.update('created_at', xmp.xmp_createDate)
            self.update('modified_at', xmp.xmp_modifyDate)
            self.result.languages.extend(xmp.dc_language)
def get_info(path):
    """OCR every page of the PDF at *path* and return
    [path, extracted_text, author, creator, producer, subject, title].
    """
    with open(path, 'rb') as f:
        # Rasterise each page at 200 dpi and save as page_<n>.jpg in the
        # current working directory.
        pages = convert_from_path(path, 200)
        image_counter = 1
        for page in pages:
            filename = "page_" + str(image_counter) + ".jpg"
            page.save(filename, 'JPEG')
            image_counter = image_counter + 1
        filelimit = image_counter - 1
        # OCR every saved page image.
        for i in range(1, filelimit + 1):
            filename = "page_" + str(i) + ".jpg"
            text = str(((pytesseract.image_to_string(Image.open(filename)))))
            # Re-join words hyphenated across line breaks.
            text = text.replace('-\n', '')
            # NOTE(review): final_text is re-created on every iteration, so
            # only the LAST page's text survives in finaltext -- looks like
            # a bug; confirm intent.
            final_text = []
            final_text.append(text)
            finaltext = "\n".join(final_text)
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        return ([
            path, finaltext, info.author, info.creator, info.producer,
            info.subject, info.title
        ])
def extract_data(request):
    """Django view: read title/author from an uploaded PDF, persist them
    and report the outcome as JSON; empty response when no file uploaded."""
    if request.FILES:
        file = request.FILES['file']
        input = PdfFileReader(file)
        # Blank-password decrypt for nominally-encrypted documents.
        if input.isEncrypted:
            input.decrypt('')
        info = input.getDocumentInfo()
        title = info.title
        author = info.author
        d = Data.objects.create(title=title, author=author)
        d.save()
        response = {'status': 1, 'message': 'Data saved'}
        # BUG FIX: HttpResponse(dict) iterates the dict's keys, producing
        # 'statusmessage' instead of JSON; serialise explicitly so the
        # declared content type is honoured.
        import json
        return HttpResponse(json.dumps(response), content_type='application/json')
    else:
        output = ''
        return HttpResponse(output)
def saveDocumentInfos(self, pdfDocument: PdfFileReader, fileDescriptor: FileDescriptor, dbQuerier: DbQuerier) -> None:
    """Persist the PDF's document entity plus every metadata entry."""
    metas = self.mapDocumentInfos(pdfDocument.getDocumentInfo())

    def getMeta(key: str) -> Any:
        # None when the key is absent (same contract as the explicit check).
        return metas.get(key)

    # Saving document
    wordTotal, charTotal = self.getDocumentWordAndCharacterCount(pdfDocument)
    documentEntity = dbQuerier.getFileDocumentEntity(
        fileDescriptor=fileDescriptor,
        title=getMeta('title'),
        author=getMeta('author'),
        pageCount=pdfDocument.getNumPages(),
        wordCount=wordTotal,
        characterCount=charTotal)

    # Saving document chapters, TODO

    # Saving document metas
    for metaName, metaValue in metas.items():
        metaNameEntity = dbQuerier.getMetaNameEntity(metaName)
        # Multi-valued entries get one row per value.
        if isinstance(metaValue, (list, set)):
            for singleValue in metaValue:
                dbQuerier.getMetaValueEntity(documentEntity, metaNameEntity,
                                             singleValue)
        else:
            dbQuerier.getMetaValueEntity(documentEntity, metaNameEntity,
                                         metaValue)
def _search(keyword, metaLocat, archive):
    """Assisting function to searchFile(). Searches through given archive
    and returns a list of the file(s) that match the keyword in the given
    metadata location."""
    global progLocation
    retList = []
    tempFolderPath = os.path.join(
        progLocation, "_tempFolder")  # Create temporary folder to extract zip to
    if os.path.exists(tempFolderPath):
        creTime = int(os.path.getctime(tempFolderPath))
        curTime = int(time.time())
        # Recreate the scratch folder only when it is older than 3 minutes;
        # a fresher one is reused as-is (no re-extract).
        if curTime - creTime > 180:
            shutil.rmtree(os.path.join(tempFolderPath))
            os.makedirs(tempFolderPath)
            archive.extractall(tempFolderPath)
    else:
        os.makedirs(tempFolderPath)
        archive.extractall(tempFolderPath)
    count = 0
    for fileName in os.listdir(tempFolderPath):
        count += 1
        #print(count)
        if fileName.endswith(".pdf"):
            openPDF = PdfFileReader(
                open(os.path.join(tempFolderPath, fileName), "rb"))
            infoDict = openPDF.getDocumentInfo()
            #print(infoDict)
            # Case-insensitive substring match against the requested
            # metadata field.
            # NOTE(review): raises KeyError when metaLocat is missing from
            # a document's info dict -- confirm every archived PDF has it.
            if keyword.lower() in infoDict[metaLocat].lower():
                retList.append(fileName)
    return retList
def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('Making pdf_files.json from base pdf files')
    with ZipFile(DOCS_PATH) as myzip:
        # List files inside zip
        filenames = list(map(lambda x: x.filename,
                             filter(lambda x: not x.is_dir(), myzip.infolist())))
        pdf_dict = defaultdict(dict)
        for file in filenames:
            logger.info(f"Processing {file}...")
            try:
                pdfReader = PdfFileReader(BytesIO(myzip.read(file)))  # read file
            except Exception as e:  # In case the file is corrupted
                logger.warning(e)
                logger.info(f"Attempting to recover {file}...")
                pdfReader = file_recovery(file, myzip)  # attempting to recover file
            # doc_dict holds the attributes of each pdf file
            # (info keys with the leading '/' stripped, values stringified)
            doc_dict = {i[1:]: str(j) for i, j in pdfReader.getDocumentInfo().items()}
            doc_dict["Country"] = file.split("/")[0]
            doc_dict["Text"] = ""
            for page in range(pdfReader.numPages):
                try:
                    page_text = pdfReader.getPage(page).extractText()  # extracting pdf text
                except TypeError as e:
                    logger.warning(e)
                    logger.info(f"Skipping {file}...")
                    continue
                # doc_dict["Text"] =
                # break
                page_text = text_cleaning(page_text)  # clean pdf text
                doc_dict["Text"] += page_text
            # Key by the bare file name (no directory, no extension).
            pdf_dict[os.path.splitext(os.path.basename(file))[0]] = doc_dict
    with open(os.path.join(INTER_PATH, 'pdf_files.json'), 'w') as outfile:
        json.dump(pdf_dict, outfile, ensure_ascii=False, indent=4)
def pdfMetaData(file_path, save=True):
    '''Get PDF document metadata, takes 2 arguments, file_path and save (boolean, default is True)'''
    pdf_doc = PdfFileReader(open(file_path, "rb"))
    if pdf_doc.isEncrypted:
        # decrypt('') handles PDFs "encrypted" with a blank owner password;
        # anything stronger aborts the script.
        try:
            if pdf_doc.decrypt("") != 1:
                sys.exit("target pdf document is encrypted... exiting...")
        except:
            sys.exit("target pdf document is encrypted with an unsupported algorithm... exiting...")
    doc_info = pdf_doc.getDocumentInfo()
    stats = os.stat(file_path)
    now = dt.now()
    file_name = getFileName(file_path)
    # Header line with the current timestamp and the file's base name.
    metadata = "Time: %d/%d/%d %d : %d : %d. Found the following metadata for file %s:\n\n" % (now.year, now.month, now.day, now.hour, now.minute, now.second, file_name[:-4])
    try:
        # md[1:] strips the leading '/' from each info key.
        for md in doc_info:
            metadata += str(md[1:]) + " : " + pretifyPyPDF2Time(str(md[1:]), str(doc_info[md])) + "\n"
    except TypeError:
        sys.exit("Couldn't read document info! Make sure target is a valid pdf document...")
    # Append filesystem timestamps and owner uid from os.stat.
    metadata += "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" % (dt.fromtimestamp(stats.st_ctime), dt.fromtimestamp(stats.st_mtime), dt.fromtimestamp(stats.st_atime), stats.st_uid)
    try:
        print(metadata)
    except UnicodeEncodeError:
        print("Console encoding can't decode the result. Enter chcp 65001 in the console and rerun the script.")
    if save:
        file_name = getFileName(file_path)
        tgt = file_name + ".txt"
        saveResult(tgt, metadata)
def extract_info(document: Document):
    """Populate document.num_pages and document.info from PDF metadata;
    non-PDF documents get a single page and "unknown" in every field."""
    if document.filename is None:
        get_filename(document)
    if not document.is_pdf:
        document.num_pages = 1
        for field in ('author', 'creator', 'producer', 'subject', 'title'):
            setattr(document.info, field, "unknown")
        return
    with open(document.path, 'rb') as f:
        pdf = PdfFileReader(f, strict=False)
        # TODO: Handle encrypted files
        document.num_pages = pdf.getNumPages()
        informations = pdf.getDocumentInfo()
        if informations is not None:
            # Falsy (None/empty) entries fall back to "unknown".
            document.info.author = informations.author or "unknown"
            document.info.creator = informations.creator or "unknown"
            document.info.producer = informations.producer or "unknown"
            document.info.subject = informations.subject or "unknown"
            document.info.title = informations.title or "unknown"
def get_pdf_exif(self, pdf_file):
    """Return the document-info dictionary of *pdf_file* as a plain dict.

    Keeps the original's return contract: the info dict on success, the
    exception object on failure, and -1 when the PDF has no info at all.
    """
    Logger.printMessage(
        message='{methodName}'.format(methodName='get_pdf_exif'),
        description=pdf_file,
        debug_module=True)
    info = ''
    data = {}
    try:
        with open(pdf_file, 'rb') as handle:
            reader = PdfFileReader(handle)
            info = reader.getDocumentInfo()
            # Page count is read to force parsing, mirroring the original.
            number_of_pages = reader.getNumPages()
            if info:
                data = {key: info[key] for key in info}
                return data
    except Exception as e:
        Logger.printMessage(
            message='{methodName}'.format(methodName='exception'),
            description=e,
            debug_module=True)
        return e
    return -1
def print_pdf(file_full_path, color_mode):
    """Analyzes the metadata of a .pdf file.

    Parameters
    ----------
    file_full_path : str
        Path of the PDF to inspect.
    color_mode : bool
        When True, output is colourised with termcolor's cprint().
    """
    # BUG FIX: this function was Python 2 only (print statements and the
    # removed file() builtin); converted to Python 3 syntax. Runtime
    # strings are unchanged.
    # Header with file path
    if color_mode:
        cprint("\n[+] Metadata for file: %s" % (file_full_path), "green", attrs=["bold"])
    else:
        print("\n[+] Metadata for file: %s" % (file_full_path))
    # Open the file
    try:
        pdf_file = PdfFileReader(open(file_full_path, "rb"))
    except Exception:
        if color_mode:
            cprint("Could not read this file. Sorry!", "red")
        else:
            print("Could not read this file. Sorry!")
        return
    if pdf_file.isEncrypted:
        # Temporary workaround, pdf encrypted with no pass
        try:
            pdf_file.decrypt('')
        except Exception:
            if color_mode:
                cprint("\tCould not decrypt this file. Sorry!", "red")
            else:
                print("\tCould not decrypt this file. Sorry!")
            return
    # Data structure with document information
    pdf_info = pdf_file.getDocumentInfo()
    # Print metadata
    if pdf_info:
        for metaItem in pdf_info:
            try:
                if color_mode:
                    cprint("\t-" + metaItem[1:] + ": ", "cyan", end="")
                    cprint(pdf_info[metaItem])
                else:
                    print("\t-" + metaItem[1:] + ": " + pdf_info[metaItem])
            except TypeError:
                if color_mode:
                    cprint("\t-" + metaItem[1:] + ": " + "Error - Item not readable", "red")
                else:
                    print("\t-" + metaItem[1:] + ": " + "Error - Item not readable")
    else:
        if color_mode:
            cprint("\t No data found", "red")
        else:
            print("\t No data found")
    print("")
def add_completed_pdf_files(self, azubi_name, year, completed_pdf_folder, pdfbanner_grid):
    """Add completed PDF files to a scroll view.

    For every completed PDF of the given apprenticeship year, read its
    '/Calendar_week' metadata entry and add a PdfBanner widget to
    *pdfbanner_grid* (a child of the ScrollView class).

    Parameters
    ----------
    azubi_name : str
        The name of the trainee.
    year : str
        The apprenticeship year.
    completed_pdf_folder : list
        File names of a folder containing completed PDF files.
    pdfbanner_grid : GridLayout
        Receives one PdfBanner per completed PDF.
    """
    screen = self.root.ids['finalizedbanner_screen']
    screen.ids['title'].text = f"Abgezeichnete Berichte - {year}. Lehrjahr"

    base_dir = os.path.join(os.getcwd(), 'Azubis', azubi_name, f'Nachweise_{year}')
    for pdf_name in completed_pdf_folder:
        with open(os.path.join(base_dir, pdf_name), 'rb') as handle:
            reader = PdfFileReader(handle)
            week = reader.getDocumentInfo()['/Calendar_week']
        # Report number: the two digits before ".pdf" when present,
        # otherwise the single digit.
        number = pdf_name[-6:-4] if pdf_name[-6].isdigit() else pdf_name[-5]
        banner = PdfBanner(azubi_name, int(number), int(year), c_w_number=int(week))
        pdfbanner_grid.add_widget(banner)
def pdfMetaData(file_path, pwd="", save=True):
    """Print and optionally save the metadata of a PDF document.

    Parameters
    ----------
    file_path : str
        Path to the PDF file.
    pwd : str, optional
        Password used to decrypt the document when it is encrypted.
    save : bool, optional
        When True (default) the report is written to "<name>.txt".
    """
    # BUG FIX: the original leaked the handle returned by open(); a
    # context manager guarantees it is released, even on sys.exit().
    with open(file_path, "rb") as fh:
        pdf_doc = PdfFileReader(fh)
        if pdf_doc.isEncrypted:
            # decrypt() returns 0 when the password does not match.
            if pdf_doc.decrypt(pwd) == 0:
                sys.exit("target pdf document is encrypted... exiting...")
        doc_info = pdf_doc.getDocumentInfo()
        stats = os.stat(file_path)
        metadata = ""
        # Keep the loop inside the `with`: info values may still be lazy
        # references into the underlying stream.
        for md in doc_info:
            # md is a PDF name such as "/Author"; drop the leading slash.
            metadata += str(md[1:]) + " : " + pretifyPyPDF2Time(
                str(md[1:]), str(doc_info[md])) + "\n"
    metadata += "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" % (
        dt.fromtimestamp(stats.st_ctime), dt.fromtimestamp(
            stats.st_mtime), dt.fromtimestamp(stats.st_atime), stats.st_uid)
    print(metadata)
    if save:
        file_name = getFileName(file_path)
        tgt = file_name + ".txt"
        saveResult(tgt, metadata)
def extractData(extension, location):
    """Extract author/book-name metadata from a pdf or Word file and store it.

    Parameters
    ----------
    extension : str
        Lower-case file extension ('pdf', 'doc', 'docs' or 'docx').
    location : str
        Full path of the file; the Windows-style basename is kept as
        the stored location.
    """
    meta_data = {'location': location, 'extension': extension}
    # Keep only the file-name component of a Windows-style path.
    meta_data['location'] = meta_data['location'].split('\\')[-1]
    if extension == 'pdf':
        with open(location, 'rb') as f:
            pdf_to_get = PdfFileReader(f)
            file_info = pdf_to_get.getDocumentInfo()
            print(file_info)
            meta_data['author'] = file_info['/Author'] if '/Author' in file_info else ''
            # /Title metadata proved unreliable, so the book name is
            # derived from the file name instead.
            meta_data['bookname'] = os.path.basename(f.name).split('.')[0]
            print(meta_data['bookname'])
    if extension == 'docx' or extension == 'docs' or extension == 'doc':
        # BUG FIX: the original also opened the file with open() here but
        # never used that handle; ZipFile(location) opens it itself.
        zf = zipfile.ZipFile(location)
        doc = lxml.etree.fromstring(zf.read('docProps/core.xml'))
        ns = {'dc': 'http://purl.org/dc/elements/1.1/'}
        creator = doc.xpath('//dc:creator', namespaces=ns)[0].text
        meta_data['author'] = creator if creator else ''
        title = doc.xpath('//dc:title', namespaces=ns)[0].text
        if title:
            meta_data['bookname'] = title
        else:
            meta_data['bookname'] = os.path.basename(
                meta_data['location']).split('.')[0]
    # TODO: add code for pptx files to extract data.
    # TODO: also search for a meta-data file when adding data.
    createOrUpdateBook(meta_data)
def preprocess_pdf(pdf_name, document):
    """Create the dated media directory for *pdf_name* and render its pages as images."""
    now = datetime.now()
    base_name = pdf_name.split('.')[0].split('/')[-1]
    # Layout: <cwd>/media/CVS/<year>/<zero-padded month>/<day>/<basename>
    target_dir = '{0}/media/CVS/{1}/{2:02d}/{3}/{4}'.format(
        os.getcwd(), now.year, now.month, now.day, base_name)
    pdf_path = target_dir + '.pdf'
    pdf_copy = copy(document._file)
    reader = PdfFileReader(document._file.file)
    # Pull document info and page count up front so a corrupt PDF fails here.
    information = reader.getDocumentInfo()
    number_of_pages = reader.getNumPages()
    try:
        os.mkdir(target_dir)
    except OSError:
        # Directory already exists (or creation failed) - proceed anyway.
        pass
    convert_pdf_to_images(pdf_name, target_dir, base_name, pdf_copy)
class PdfHandler(object):
    """Thin wrapper around PyPDF2's PdfFileReader for a single PDF file."""

    def __init__(self, pdfFile):
        self.openFile(pdfFile)

    def openFile(self, pdfFile):
        """Open *pdfFile* and cache its info object and page count."""
        self.pdfObj = open(pdfFile, 'rb')
        self.pdf = PdfFileReader(self.pdfObj)
        self.info = self.pdf.getDocumentInfo()
        self.number_of_pages = self.pdf.getNumPages()

    def closePdf(self):
        """Release the underlying file handle."""
        self.pdfObj.close()

    def getNumberOfPages(self):
        # Re-queries the reader rather than using the cached count.
        return self.pdf.getNumPages()

    def getPdfMetadata(self):
        """Return page count plus producer/author/creator in a dict."""
        return {
            "number_of_pages": self.number_of_pages,
            "info": {
                "default": self.info,
                "producer": self.info.producer,
                "author": self.info.author,
                "creator": self.info.creator,
            },
        }

    def getText(self):
        # NOTE(review): only page index 4 is extracted; the all-pages loop
        # was left commented out by the original author - confirm intent.
        return self.pdf.getPage(4).extractText()
def main():
    """Read mdb.pdf, copy its pages into mdbt.pdf, then merge both into mdb-merger.pdf."""
    # BUG FIX: the original leaked every file handle it opened (five in
    # total); all file access now uses context managers.
    with open('mdb.pdf', 'rb') as src:
        # Build a PdfFileReader for the source document.
        pdf_input = PdfFileReader(src)
        pge_num = pdf_input.getNumPages()  # page count
        print(pge_num)
        print(pdf_input.getDocumentInfo())  # document information
        # Collect every page as a PageObject.
        pages_from_row = [
            pdf_input.getPage(i) for i in range(pdf_input.getNumPages())
        ]
        # Copy the pages into a PdfFileWriter and flush it to disk while
        # the source stream is still open (pages reference it lazily).
        pdf_output = PdfFileWriter()
        for page in pages_from_row:
            pdf_output.addPage(page)
        with open('mdbt.pdf', 'wb') as dst:
            pdf_output.write(dst)
    # Merge the original document and the copy.
    merger = PdfFileMerger()
    with open('mdb.pdf', 'rb') as f1, open('mdbt.pdf', 'rb') as f2:
        merger.append(PdfFileReader(f1))
        merger.append(PdfFileReader(f2))
        with open('mdb-merger.pdf', 'wb') as out:
            merger.write(out)
def get_metadata(path):
    """Collect basic PDF metadata plus a generated summary of its full text."""
    # Checks if Scanned PDF (Needs to be added)
    full_text = ""
    with open(path, 'rb') as fh:
        reader = PdfFileReader(fh)
        info = reader.getDocumentInfo()
        page_count = reader.getNumPages()
        # Concatenate the extracted text of every page.
        for index in range(page_count):
            full_text += reader.getPage(index).extractText()
        # Build the result while the stream is still open, since info
        # fields may resolve lazily from it.
        return {
            'author': info.author,
            'creator': info.creator,
            'producer': info.producer,
            'subject': info.subject,
            'title': info.title,
            'numpages': page_count,
            'summary': generate_summary(full_text),
        }
def get_metadata(filename):
    """Print a PDF's metadata and pickle it to '<filename>_Output.txt'.

    Parameters
    ----------
    filename : str
        Path of the PDF to read.
    """
    # Reading data from pdf file; 'rb' is required by PyPDF2.
    with open(filename, 'rb') as fin:
        reader = PdfFileReader(fin)
        metadata = reader.getDocumentInfo()
        number_of_pages = reader.getNumPages()
        # BUG FIX: pickle.dump() requires a binary-mode file; the original
        # opened the output with "w" (text mode), which raises
        # "TypeError: write() argument must be str, not bytes".
        with open(filename + "_Output.txt", "wb") as out_file:
            pickle.dump(metadata, out_file)
    pprint.pprint(metadata)
    print("Metadata has been saved to " + filename + "_Output.txt")
def print_pdf(file_full_path):
    """Print the metadata, page count and encryption flag of a PDF file."""
    # Header with file path
    cprint("[+] Metadata for file: %s " % (file_full_path), "green", attrs=['bold'])
    # Open the file.
    # BUG FIX: the Python 2 only file() builtin was replaced with open();
    # file() does not exist under Python 3.
    pdf_file = PdfFileReader(open(file_full_path, 'rb'))
    # Create a dictionary with the info
    pdf_info = pdf_file.getDocumentInfo()
    # Print metadata (runtime strings kept byte-identical, typos included)
    if pdf_info:
        for metaItem in pdf_info:
            try:
                cprint('\t ' + metaItem[1:] + ': ', 'cyan', end="")
                cprint(pdf_info[metaItem])
            except TypeError:
                cprint(
                    '\t ' + metaItem[1:] + ': ' + 'Error - Item not redeable',
                    'red')
    else:
        cprint('Not data found', 'red')
    # Print other info
    cprint("\t Number of pages: %s" % pdf_file.getNumPages(), 'cyan')
    cprint("\t Is Encripted: %s" % pdf_file.getIsEncrypted(), 'cyan')
def parse_api(self, response):
    """Parse the issues JSON API response and yield one item per issue.

    For each issue carrying an 'id', builds the catalog PDF URL, downloads
    the PDF and prints its document info.
    """
    # Local imports so this callback is self-contained.
    import io
    import urllib.request

    data = json.loads(response.body)
    for issue in data["issues"]:
        for key in issue:
            if key == 'id':
                link = ('https://www.myeblaettle.de/frontend/catalogs/'
                        + str(issue[key]) + '/1/pdf/complete.pdf')
                print(link)
                # BUG FIX: open() cannot read an HTTPS URL (the original
                # always raised FileNotFoundError). Download the bytes and
                # wrap them in a seekable buffer for PyPDF2.
                with urllib.request.urlopen(link) as resp:
                    pdf = PdfFileReader(io.BytesIO(resp.read()))
                info = pdf.getDocumentInfo()
                number_pages = pdf.getNumPages()
                print(info)
                # BUG FIX: the original printed `date`, which was never
                # assigned (its strptime line was commented out) -> NameError.
                # Print the raw /ModDate field when present instead.
                if info and "/ModDate" in info:
                    print(info["/ModDate"])
        yield {"linkid": link}
def pdf_print(self):
    """Merge all generated PDFs into one bookmarked temporary file.

    Returns
    -------
    str
        Path of the merged temporary PDF.
    """
    generated = self.__generate_pdfs()
    merger = PdfFileMerger()
    for filename in generated:
        # Bookmark label is the /Title without the " - Pharmaship" suffix.
        title = PdfFileReader(filename).getDocumentInfo()["/Title"]
        merger.append(filename, bookmark=title.split(" - Pharmaship")[0])

    # Write the merged output into a named temporary file that survives
    # this function (delete=False).
    tmp_file = tempfile.NamedTemporaryFile(
        prefix="pharmaship_all_", suffix=".pdf", delete=False
    )
    merger.write(tmp_file.name)
    merger.close()

    # Cleanup: remove the intermediate files.
    for filename in generated:
        Path(filename).unlink()

    return tmp_file.name
def verify(self):
    """Verify signature of the current document.

    Returns
    -------
    None when there is no signature, False when it is invalid, and a
    formatted description string when it is valid.
    """
    # BUG FIX: the original never closed file_in; a context manager
    # guarantees the handle is released.
    with open(self.file, 'rb') as file_in:
        pdf_reader = PdfFileReader(file_in)
        metadata = pdf_reader.getDocumentInfo().copy()
        if '/Signature' not in metadata:
            return None
        # Read the value while the stream is open in case it resolves lazily.
        serialized_signature = metadata['/Signature']
    stored_signature = Signature.from_serialized(serialized_signature, self.w3)
    # Identity test against None (was `== None`).
    if stored_signature is None:
        return False
    # Recompute the signature over the current file content and compare.
    calculated_signature = Signature(
        stored_signature.name,
        self._get_hash(stored_signature.name),
        stored_signature.transaction,
    )
    if stored_signature == calculated_signature:
        return f'''Signed by **{stored_signature.name}**
**Date and time**: {stored_signature.date}
**From address**: {stored_signature.address}
**Transaction hash**: {stored_signature.transaction}
'''
    else:
        return False