def do(filename=''):
    """Print a PDF's Info and XMP metadata and return the parsed document.

    Args:
        filename: Path of the PDF file; it is opened in binary mode.

    Returns:
        A ``(doc, info)`` tuple: the ``PDFDocument`` built from the file and
        the first element of ``doc.info``.

    Raises:
        IndexError: If ``doc.info`` is an empty sequence.
    """
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)

        # Single-argument print(...) gives the same output on Python 2 and 3,
        # whereas the former ``print x`` statement is a SyntaxError on 3.
        print(doc.info)  # The "Info" metadata

        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            print(metadata)  # The raw XMP metadata
            print(xmp_to_dict(metadata))
        return doc, doc.info[0]
    except Exception:
        # Nothing is handed back to the caller on failure, so the file can be
        # released safely before the error propagates.
        fp.close()
        raise
    # NOTE(review): on success the file is left open on purpose, because the
    # returned ``doc`` presumably still reads from it lazily -- confirm
    # against the pdfminer version in use.
def do(filename=''):
    """Dump the Info and XMP metadata of a PDF and hand back the document.

    Args:
        filename: Path of the PDF file; it is opened in binary mode.

    Returns:
        A ``(doc, info)`` tuple, where ``doc`` is the ``PDFDocument`` created
        for the file and ``info`` is ``doc.info[0]``.

    Raises:
        IndexError: If ``doc.info`` is an empty sequence.
    """
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)

        # print(...) with one argument works identically on Python 2 and 3;
        # the original print statements only parsed on Python 2.
        print(doc.info)  # The "Info" metadata

        catalog = doc.catalog
        if 'Metadata' in catalog:
            metadata = resolve1(catalog['Metadata']).get_data()
            print(metadata)  # The raw XMP metadata
            print(xmp_to_dict(metadata))

        return doc, doc.info[0]
    except Exception:
        # The document is not returned on failure, so do not leak the handle.
        fp.close()
        raise
    # NOTE(review): the handle stays open on the success path because the
    # returned document presumably keeps reading from it -- confirm.
Example #3
0
    def _get_xmp_metadata(self):
        """Extract a title and an author label from the XMP metadata.

        Reads the stream referenced by ``self.doc.catalog["Metadata"]``,
        converts it with ``xmp_to_dict`` and inspects the ``dc`` entries.

        Returns:
            A ``(title, author)`` tuple. ``title`` is the ``x-default`` entry
            of ``dc:title``, or the title itself when it is a plain
            str/bytes value. ``author`` is ``dc:creator`` passed through
            ``self._au_last_name``: the single creator, or the first and
            last creators joined by a space. Either item is ``None`` when
            it cannot be determined.
        """
        t = a = None
        try:
            # The catalog lookup is inside the try so that a document with
            # no "Metadata" entry yields (None, None) instead of raising.
            metadata = resolve1(self.doc.catalog["Metadata"]).get_data()
            md = xmp_to_dict(metadata)
        except Exception:
            # Best effort: unreadable or unparsable XMP means "no metadata".
            # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit
            # still propagate.
            return t, a

        try:
            title = md["dc"]["title"]
        except KeyError:
            title = None
        if isinstance(title, bytes):
            t = title.decode("utf-8")
        elif isinstance(title, str):
            t = title
        elif title is not None:
            # Normally a mapping of language code -> text.
            try:
                t = title["x-default"]
            except (KeyError, TypeError):
                pass

        try:
            creators = md["dc"]["creator"]
        except KeyError:
            return t, a

        if isinstance(creators, bytes):
            creators = creators.decode("utf-8")
        if isinstance(creators, str):
            creators = [creators]
        creators = [c for c in creators if c]  # drop None, empty strings, ...
        if len(creators) > 1:
            a = " ".join(
                (self._au_last_name(creators[0]), self._au_last_name(creators[-1]))
            )
        elif creators:
            a = self._au_last_name(creators[0])

        return t, a
    def _get_xmp_metadata(self):
        """Return ``(title, author)`` taken from the document's XMP metadata.

        The XMP stream is fetched from ``self.doc.catalog['Metadata']`` and
        parsed with ``xmp_to_dict``; only the ``dc`` entries are used.

        Returns:
            A 2-tuple. The first item is the ``x-default`` value of
            ``dc:title`` (or the title itself if it is a plain str/bytes
            value). The second item is derived from ``dc:creator`` via
            ``self._au_last_name``: one creator gives that single result,
            several give ``'<first> <last>'``. Missing values are ``None``.
        """
        t = a = None
        try:
            # Looking the entry up inside the try turns a missing 'Metadata'
            # key into the regular (None, None) result.
            metadata = resolve1(self.doc.catalog['Metadata']).get_data()
            md = xmp_to_dict(metadata)
        except Exception:
            # Deliberately broad (best effort), but no longer a bare except,
            # so KeyboardInterrupt / SystemExit are not swallowed.
            return t, a

        try:
            t = md['dc']['title']['x-default']
        except TypeError:
            # 'title' is not a language map; accept a plain str/bytes value.
            plain_title = md['dc']['title']
            if isinstance(plain_title, bytes):
                t = plain_title.decode('utf-8')
            elif isinstance(plain_title, str):
                t = plain_title
        except KeyError:
            pass

        try:
            creators = md['dc']['creator']
        except KeyError:
            return t, a

        if isinstance(creators, bytes):
            creators = creators.decode('utf-8')
        if isinstance(creators, str):
            creators = [creators]
        creators = [c for c in creators if c]  # remove None, empty strings, ...
        if len(creators) > 1:
            a = '%s %s' % (self._au_last_name(creators[0]),
                           self._au_last_name(creators[-1]))
        elif creators:
            a = self._au_last_name(creators[0])

        return t, a
Example #5
0
 def _get_xmp_metadata(self):
     """Return ``(title, author)`` read from the document's XMP metadata.

     The stream at ``self.doc.catalog['Metadata']`` is parsed with
     ``xmp_to_dict``. The title is the ``x-default`` entry of ``dc:title``;
     the author is ``dc:creator`` passed through ``self._au_last_name``
     (first and last creator joined by a space when there are several).
     Items that are missing are returned as ``None``.

     NOTE(review): the catalog lookup is not guarded, so a catalog without
     a 'Metadata' entry presumably raises here -- callers should check first.
     """
     t = a = None
     metadata = resolve1(self.doc.catalog['Metadata']).get_data()
     try:
         md = xmp_to_dict(metadata)
     except Exception:
         # Best effort on unparsable XMP; unlike a bare ``except:`` this
         # lets KeyboardInterrupt and SystemExit through.
         return t, a
     try:
         t = md['dc']['title']['x-default']
     except KeyError:
         pass
     try:
         a = md['dc']['creator']
     except KeyError:
         pass
     else:
         if isinstance(a, str):
             a = [a]
         # filter() is a lazy iterator on Python 3 and has no len(); build a
         # real list so the length checks work on both Python 2 and 3.
         a = [name for name in a if name]  # remove None, empty strings, ...
         if len(a) > 1:
             a = '%s %s' % (self._au_last_name(a[0]),
                     self._au_last_name(a[-1]))
         elif len(a) == 1:
             a = self._au_last_name(a[0])
         else:
             a = None
     return t, a
Example #6
0
    def proc(self, pdfFp):
        """Collect the metadata a PDF document exposes.

        Stores ``doc.info`` in ``self.info``, the parsed XMP packet (only
        when the catalog has a 'Metadata' entry) in ``self.metadata``, and
        the result of ``pdfFp.getvalue()`` in ``self.raw_doc``.
        """
        pdf_parser = PDFParser(pdfFp)
        document = PDFDocument(pdf_parser)
        pdf_parser.set_document(document)
        document.initialize()

        self.info = document.info

        catalog = document.catalog
        if 'Metadata' in catalog:
            xmp_stream = resolve1(catalog['Metadata'])
            self.metadata = xmp_to_dict(xmp_stream.get_data())

        self.raw_doc = pdfFp.getvalue()
Example #7
0
    def _get_xmp_metadata(self):
        """Extract ``(title, author)`` from the document's XMP metadata.

        The XMP stream is read from ``self.doc.catalog['Metadata']`` and
        parsed with ``xmp_to_dict``; only the ``dc`` entries are used.

        Returns:
            A 2-tuple. The title is the ``x-default`` entry of ``dc:title``,
            or the title itself when it is a plain str/bytes value. The
            author comes from ``dc:creator`` via ``self._au_last_name``:
            several creators give ``'<first> <last>'``, a single
            non-whitespace creator gives that one result. Anything missing
            is ``None``.
        """
        t = a = None
        try:
            metadata = resolve1(self.doc.catalog['Metadata']).get_data()
            md = xmp_to_dict(metadata)
        except Exception:
            # Best effort: a missing, unreadable or unparsable XMP packet
            # means "no metadata". ``except Exception`` (rather than a bare
            # except) keeps KeyboardInterrupt / SystemExit propagating.
            return t, a

        try:
            title = md['dc']['title']
        except KeyError:
            title = None
        if isinstance(title, str):
            # The 'title' field might be a string or bytes instead of a dict
            # https://github.com/jdmonaco/pdf-title-rename/issues/7
            t = title
        elif isinstance(title, bytes):
            t = title.decode()
        elif title is not None:
            try:
                t = title['x-default']
            except (KeyError, TypeError):
                pass

        try:
            creators = md['dc']['creator']
        except KeyError:
            return t, a

        if isinstance(creators, bytes):
            creators = creators.decode('utf-8')
        if isinstance(creators, str):
            creators = [creators]
        creators = [c for c in creators if c]  # remove None, empty strings, ...
        if len(creators) > 1:
            a = '%s %s' % (self._au_last_name(creators[0]),
                           self._au_last_name(creators[-1]))
        elif len(creators) == 1 and not creators[0].isspace():
            a = self._au_last_name(creators[0])

        return t, a
    def _get_xmp_metadata(self):
        """Return ``(title, author)`` from the document's XMP metadata.

        The stream referenced by ``self.doc.catalog['Metadata']`` is parsed
        with ``xmp_to_dict`` and its ``dc`` entries are inspected.

        Returns:
            A 2-tuple. The title is ``dc:title``'s ``x-default`` entry, or
            the title itself when it is a plain str/bytes value. The author
            is built from ``dc:creator`` with ``self._au_last_name`` (first
            and last creator joined by a space when there are several).
            Values that cannot be determined are ``None``.
        """
        t = a = None
        try:
            # Guard the catalog lookup too, so a document without a
            # 'Metadata' entry gives (None, None) rather than an exception.
            metadata = resolve1(self.doc.catalog['Metadata']).get_data()
            md = xmp_to_dict(metadata)
        except Exception:
            # Broad on purpose (best effort), but not a bare except, so
            # KeyboardInterrupt and SystemExit are not swallowed.
            return t, a

        try:
            t = md['dc']['title']['x-default']
        except TypeError:
            # The 'title' field might be a string or bytes instead of a dict
            # https://github.com/jdmonaco/pdf-title-rename/issues/7
            titleval = md['dc']['title']
            if isinstance(titleval, bytes):
                t = titleval.decode()
            elif isinstance(titleval, str):
                t = titleval
        except KeyError:
            pass

        try:
            a = md['dc']['creator']
        except KeyError:
            pass
        else:
            if isinstance(a, bytes):
                a = a.decode('utf-8')
            if isinstance(a, str):
                a = [a]
            a = [name for name in a if name]  # remove None, empty strings, ...
            if len(a) > 1:
                a = '%s %s' % (self._au_last_name(a[0]), self._au_last_name(a[-1]))
            elif a:
                a = self._au_last_name(a[0])
            else:
                a = None

        return t, a
Example #9
0
    return False


# loop through directories
for subdir, dirs, files in os.walk(arg_path):
    for file in files:
        file_count += 1
        filepath = subdir + os.sep + file
        if filepath.endswith(".pdf"):
            pdffilecount += 1
        try:
            pdfdoc = parsePDFfile(filepath)
            if checkMetadata(pdfdoc):
                metadata = resolve1(pdfdoc.catalog['Metadata']).get_data()
                dirname = subdir.split(os.path.sep)[-1]
                pdfdict = xmp_to_dict(metadata)
                dict1 = pdfdoc.info[0]
                xkeywords = None
                xdesc = None
                xcreator = None
                xtitle = None
                xfolder = None
                try:
                    xkeywords = str(pdfdict['pdf']['Keywords']).replace(
                        '\r\n', ', ')
                except:
                    xkeywords = ''
                    pass
                try:
                    xdesc = pdfdict['dc']['description']['x-default']
                except:
Example #10
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from xmp import xmp_to_dict

# Print the Info dictionary and, when present, the XMP metadata of a sample
# PDF. The file is opened in a ``with`` block so the handle is released as
# soon as the metadata has been read, even if parsing fails.
with open(
        'c:/Users/mihal/OneDrive/Documents/pyprj/pdf-metadata-master/example/docs/methods_of_web_philology.pdf',
        'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)

    print(doc.info)  # The "Info" metadata

    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        print(metadata)  # The raw XMP metadata
        print(xmp_to_dict(metadata))
"""
c:/Users/mihal/OneDrive/Documents/pyprj/pdf-metadata-master/example/docs/Haltermanpythonbook.pdf
c:/Users/mihal/OneDrive/Documents/pyprj/pdf-metadata-master/example/docs/methods_of_web_philology.pdf
c:/Users/mihal/OneDrive/Documents/pyprj/pdf-metadata-master/example/docs/pdf_wiki.pdf
"""