def do(filename=''):
    """Print a PDF's Info and XMP metadata and return the parsed document.

    Opens ``filename`` in binary mode, builds a ``PDFDocument`` from it and
    prints ``doc.info``. When the document catalog has a ``Metadata`` entry,
    the raw XMP packet and its ``xmp_to_dict`` conversion are printed too.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A ``(doc, info)`` tuple, where ``info`` is the first entry of
        ``doc.info``, or an empty dict when ``doc.info`` is empty.
    """
    # NOTE(review): the handle is not closed here. The returned document was
    # built from a parser wrapping it, so closing it may break later reads
    # by the caller -- confirm before adding a close().
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the former print statements were a SyntaxError on Python 3.
    print(doc.info)  # The "Info" metadata
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        print(metadata)  # The raw XMP metadata
        print(xmp_to_dict(metadata))

    # doc.info may be empty; indexing it unconditionally raised IndexError.
    info = doc.info[0] if doc.info else {}
    return doc, info
def do(filename=''):
    """Dump the metadata of a PDF file to stdout and return the document.

    The file is opened in binary mode and parsed into a ``PDFDocument``.
    ``doc.info`` is printed first; if the catalog contains a ``Metadata``
    entry, the raw XMP data and the result of ``xmp_to_dict`` on it follow.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        ``(doc, info)``: the parsed document and ``doc.info[0]``, or an
        empty dict in place of the latter when ``doc.info`` has no entries.
    """
    # NOTE(review): fp stays open on purpose for now -- the document that is
    # returned was created from a parser reading this handle, and may still
    # need it. Verify against callers before closing it here.
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)

    # Function-call form of print works on both Python 2 and Python 3.
    print(doc.info)  # The "Info" metadata
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        print(metadata)  # The raw XMP metadata
        print(xmp_to_dict(metadata))

    # Guard against an empty info list instead of raising IndexError.
    if doc.info:
        return doc, doc.info[0]
    return doc, {}
def _get_xmp_metadata(self):
    """Read a title and an author string from the document's XMP metadata.

    Returns:
        tuple: ``(title, author)``. ``title`` is the ``x-default`` entry of
        ``dc:title``, or the title itself when it is a plain string or
        bytes value. ``author`` is built from ``dc:creator``: the first
        non-empty entry passed through ``self._au_last_name``, joined with
        the last one by a space when there are several. Either value is
        ``None`` when unavailable; both are ``None`` when the XMP data
        cannot be converted by ``xmp_to_dict``.
    """
    title = author = None
    metadata = resolve1(self.doc.catalog["Metadata"]).get_data()
    try:
        md = xmp_to_dict(metadata)
    except Exception:
        # Best effort: unparseable XMP simply yields no metadata. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return title, author

    try:
        raw_title = md["dc"]["title"]
    except KeyError:
        raw_title = None
    try:
        title = raw_title["x-default"]
    except KeyError:
        pass
    except TypeError:
        # The title is not a language map: accept a plain string or bytes
        # value, ignore anything else (including a missing title).
        if isinstance(raw_title, str):
            title = raw_title
        elif isinstance(raw_title, bytes):
            title = raw_title.decode("utf-8")

    try:
        creators = md["dc"]["creator"]
    except KeyError:
        return title, author

    if isinstance(creators, bytes):
        creators = creators.decode("utf-8")
    if isinstance(creators, str):
        # A single creator given as a string; wrap it so it is not
        # iterated character by character below.
        creators = [creators]
    creators = list(filter(bool, creators))  # remove None, empty strings, ...
    if len(creators) > 1:
        author = " ".join(
            (self._au_last_name(creators[0]), self._au_last_name(creators[-1]))
        )
    elif len(creators) == 1:
        author = self._au_last_name(creators[0])
    return title, author
def _get_xmp_metadata(self):
    """Return ``(title, author)`` taken from the document's XMP metadata.

    The XMP stream referenced by the catalog's ``Metadata`` entry is
    converted with ``xmp_to_dict``. The title comes from
    ``dc:title['x-default']`` (or from ``dc:title`` directly when it is a
    plain string/bytes value). The author is derived from ``dc:creator``
    via ``self._au_last_name``: one name for a single creator, the first
    and last names separated by a space for several.

    Returns:
        tuple: ``(title, author)``; each element is ``None`` when it could
        not be determined.
    """
    t = a = None
    metadata = resolve1(self.doc.catalog['Metadata']).get_data()
    try:
        md = xmp_to_dict(metadata)
    except Exception:
        # Deliberately best-effort, but no longer swallows
        # KeyboardInterrupt/SystemExit as the bare except did.
        return t, a

    try:
        titleval = md['dc']['title']
    except KeyError:
        titleval = None
    try:
        t = titleval['x-default']
    except KeyError:
        pass
    except TypeError:
        # 'title' is not subscriptable by key: it may be a plain string or
        # bytes value (or absent, in which case t stays None).
        if isinstance(titleval, str):
            t = titleval
        elif isinstance(titleval, bytes):
            t = titleval.decode('utf-8')

    try:
        a = md['dc']['creator']
    except KeyError:
        pass
    else:
        if isinstance(a, bytes):
            a = a.decode('utf-8')
        if isinstance(a, str):
            a = [a]
        a = list(filter(bool, a))  # remove None, empty strings, ...
        if len(a) > 1:
            a = '%s %s' % (self._au_last_name(a[0]),
                           self._au_last_name(a[-1]))
        elif len(a) == 1:
            a = self._au_last_name(a[0])
        else:
            a = None
    return t, a
def _get_xmp_metadata(self):
    """Return ``(title, author)`` extracted from the XMP metadata.

    Converts the stream behind the catalog's ``Metadata`` entry with
    ``xmp_to_dict`` and reads ``dc:title`` and ``dc:creator`` from the
    result. Creator names are reduced with ``self._au_last_name``; when
    there are several, the first and last are joined by a space.

    Returns:
        tuple: ``(title, author)``; each element is ``None`` when it could
        not be determined.
    """
    t = a = None
    metadata = resolve1(self.doc.catalog['Metadata']).get_data()
    try:
        md = xmp_to_dict(metadata)
    except Exception:
        # Best effort: treat unparseable XMP as "no metadata" without also
        # trapping KeyboardInterrupt/SystemExit.
        return t, a

    try:
        titleval = md['dc']['title']
    except KeyError:
        titleval = None
    try:
        t = titleval['x-default']
    except KeyError:
        pass
    except TypeError:
        # The title is a plain value rather than a language map (or is
        # missing, in which case t stays None).
        if isinstance(titleval, str):
            t = titleval
        elif isinstance(titleval, bytes):
            t = titleval.decode('utf-8')

    try:
        a = md['dc']['creator']
    except KeyError:
        pass
    else:
        # Check str first so this also behaves on Python 2, where bytes is
        # an alias of str; real bytes (Python 3) are decoded before wrapping.
        if isinstance(a, str):
            a = [a]
        elif isinstance(a, bytes):
            a = [a.decode('utf-8')]
        # filter() returns an iterator on Python 3, so it must be turned
        # into a list before len() and indexing are used on it.
        a = list(filter(bool, a))  # remove None, empty strings, ...
        if len(a) > 1:
            a = '%s %s' % (self._au_last_name(a[0]),
                           self._au_last_name(a[-1]))
        elif len(a) == 1:
            a = self._au_last_name(a[0])
        else:
            a = None
    return t, a
def proc(self, pdfFp):
    """Parse ``pdfFp`` as a PDF and record its metadata on this instance.

    Sets ``self.info`` to the document's ``info``, sets ``self.metadata``
    to the ``xmp_to_dict`` conversion of the XMP stream (only when the
    catalog has a ``Metadata`` entry), and sets ``self.raw_doc`` to
    ``pdfFp.getvalue()``.
    """
    pdf_parser = PDFParser(pdfFp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    self.info = document.info

    # self.metadata is only assigned when an XMP stream is present.
    if 'Metadata' in document.catalog:
        xmp_stream = resolve1(document.catalog['Metadata'])
        xmp_data = xmp_stream.get_data()
        self.metadata = xmp_to_dict(xmp_data)

    # getvalue() implies pdfFp is an in-memory buffer rather than a plain
    # file object.
    self.raw_doc = pdfFp.getvalue()
def _get_xmp_metadata(self):
    """Return ``(title, author)`` extracted from the XMP metadata.

    Any failure to fetch the XMP stream from the catalog or to convert it
    with ``xmp_to_dict`` yields ``(None, None)``. Otherwise the title is
    read from ``dc:title`` and the author from ``dc:creator``, with creator
    names reduced through ``self._au_last_name``.

    Returns:
        tuple: ``(title, author)``; each element is ``None`` when it could
        not be determined.
    """
    t = a = None
    try:
        metadata = resolve1(self.doc.catalog['Metadata']).get_data()
        md = xmp_to_dict(metadata)
    except Exception:
        # Best effort: a missing or unparseable XMP stream means no
        # metadata. Exception (rather than a bare except) keeps
        # KeyboardInterrupt and SystemExit from being swallowed.
        return t, a

    try:
        t = md['dc']['title']['x-default']
    except TypeError:
        # The 'title' field might be a string or bytes instead of a dict
        # https://github.com/jdmonaco/pdf-title-rename/issues/7
        titleval = md['dc']['title']
        if isinstance(titleval, str):
            t = titleval
        elif isinstance(titleval, bytes):
            t = titleval.decode()
    except KeyError:
        pass

    try:
        a = md['dc']['creator']
    except KeyError:
        pass
    else:
        if isinstance(a, bytes):
            a = a.decode('utf-8')
        if isinstance(a, str):
            a = [a]
        a = list(filter(bool, a))  # remove None, empty strings, ...
        if len(a) > 1:
            a = '%s %s' % (self._au_last_name(a[0]),
                           self._au_last_name(a[-1]))
        elif len(a) == 1 and not a[0].isspace():
            # A lone whitespace-only creator is treated as "no author".
            a = self._au_last_name(a[0])
        else:
            a = None
    return t, a
def _get_xmp_metadata(self):
    """Return ``(title, author)`` extracted from the XMP metadata.

    The stream behind the catalog's ``Metadata`` entry is converted with
    ``xmp_to_dict``; if that conversion fails, ``(None, None)`` is
    returned. The title is read from ``dc:title`` (either its
    ``x-default`` entry or the value itself when it is a string/bytes),
    the author from ``dc:creator`` via ``self._au_last_name``.

    Returns:
        tuple: ``(title, author)``; each element is ``None`` when it could
        not be determined.
    """
    t = a = None
    metadata = resolve1(self.doc.catalog['Metadata']).get_data()
    try:
        md = xmp_to_dict(metadata)
    except Exception:
        # Best effort, but without trapping KeyboardInterrupt/SystemExit
        # the way a bare except does.
        return t, a

    try:
        t = md['dc']['title']['x-default']
    except TypeError:
        # The 'title' field might be a string or bytes instead of a dict
        # https://github.com/jdmonaco/pdf-title-rename/issues/7
        titleval = md['dc']['title']
        if isinstance(titleval, str):
            t = titleval
        elif isinstance(titleval, bytes):
            t = titleval.decode()
    except KeyError:
        pass

    try:
        a = md['dc']['creator']
    except KeyError:
        pass
    else:
        if isinstance(a, bytes):
            a = a.decode('utf-8')
        if isinstance(a, str):
            a = [a]
        a = list(filter(bool, a))  # remove None, empty strings, ...
        if len(a) > 1:
            a = '%s %s' % (self._au_last_name(a[0]),
                           self._au_last_name(a[-1]))
        elif len(a) == 1 and not a[0].isspace():
            # NOTE(review): a whitespace-only creator is skipped here on
            # the assumption that _au_last_name cannot derive a name from
            # it -- confirm against that helper.
            a = self._au_last_name(a[0])
        else:
            a = None
    return t, a
return False # loop through directories for subdir, dirs, files in os.walk(arg_path): for file in files: file_count += 1 filepath = subdir + os.sep + file if filepath.endswith(".pdf"): pdffilecount += 1 try: pdfdoc = parsePDFfile(filepath) if checkMetadata(pdfdoc): metadata = resolve1(pdfdoc.catalog['Metadata']).get_data() dirname = subdir.split(os.path.sep)[-1] pdfdict = xmp_to_dict(metadata) dict1 = pdfdoc.info[0] xkeywords = None xdesc = None xcreator = None xtitle = None xfolder = None try: xkeywords = str(pdfdict['pdf']['Keywords']).replace( '\r\n', ', ') except: xkeywords = '' pass try: xdesc = pdfdict['dc']['description']['x-default'] except:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from xmp import xmp_to_dict

# Document whose metadata is printed by this script.
PDF_PATH = 'c:/Users/mihal/OneDrive/Documents/pyprj/pdf-metadata-master/example/docs/methods_of_web_philology.pdf'

# The context manager closes the file once all reads are done; every access
# to the document is kept inside the block so none happens on a closed file.
with open(PDF_PATH, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    print(doc.info)  # The "Info" metadata
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        print(metadata)  # The raw XMP metadata
        print(xmp_to_dict(metadata))

# The string below has no runtime effect; presumably it lists sample
# documents that can be substituted for PDF_PATH -- TODO confirm.
"""
c:/Users/mihal/OneDrive/Documents/pyprj/pdf-metadata-master/example/docs/Haltermanpythonbook.pdf
c:/Users/mihal/OneDrive/Documents/pyprj/pdf-metadata-master/example/docs/methods_of_web_philology.pdf
c:/Users/mihal/OneDrive/Documents/pyprj/pdf-metadata-master/example/docs/pdf_wiki.pdf
"""