def get_file_mimetype(file):
    """Determine the most specific MIME type available for a file.

    Detection runs in stages:

    1. ExifTool reports the ``File:MIMEType`` tag for ``file.name``.
    2. If that is ``application/pdf``, the XMP metadata is inspected and the
       result becomes ``application/pdfa`` when the version returned by
       ``get_pdfa_version`` is listed in
       ``app.config["PDFA"]["ACCEPTED_VERSIONS"]``.
    3. If it is one of ``app.config["GENERIC_MIMETYPES"]``, ``magic`` is asked
       for a MIME type; if that is still generic, ``magic``'s description of
       the first 2048 bytes is matched against ``app.config["FILEFORMATS"]``
       and the entry at the same position in ``app.config["FILEMIMETYPES"]``
       is used.

    Args:
        file: Object whose ``name`` attribute is the path of the file to
            inspect.

    Returns:
        The detected MIME type, or ``"Unknown/Corrupted"`` when a
        ``ValueError`` or ``PdfReadError`` is raised during detection.
    """
    try:
        # Use the helper as a context manager so the exiftool subprocess is
        # terminated as soon as the tag has been read.
        with exiftool.ExifToolHelper() as exif_helper:
            mimeTypeFile = exif_helper.get_metadata(file.name)[0]["File:MIMEType"]

        if mimeTypeFile == "application/pdf":
            # Check whether the PDF declares an accepted PDF/A version.
            with open(file.name, mode="rb") as fileData:
                reader = PdfFileReader(fileData, strict=False)
                try:
                    metadata = reader.getXmpMetadata()
                    if metadata:
                        pdfa = app.config["PDFA"]
                        nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
                        if get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]:
                            mimeTypeFile = "application/pdfa"
                except ExpatError:
                    # Malformed XMP is not fatal: keep "application/pdf".
                    app.logger.log(logging.WARNING, "File {0} has not well-formed XMP data, could not verify if application/pdf has PDF/A1 DOCINFO.".format(file.name))
        elif mimeTypeFile in app.config["GENERIC_MIMETYPES"]:
            # ExifTool only gave a generic answer; ask magic for a MIME type.
            mimeTypeFile = magic.from_file(file.name, mime=True)
            if mimeTypeFile in app.config["GENERIC_MIMETYPES"]:
                # Still generic: fall back to magic's textual description.
                with open(file.name, mode="rb") as fileData:
                    documentTypeFile = magic.from_buffer(fileData.read(2048))
                # zip() stops at the shorter list, so an unpaired entry can
                # neither raise on ``in None`` nor set the result to None.
                # There is no break: the last matching format wins.
                for fileMimetype, fileFormat in zip(
                    app.config["FILEMIMETYPES"], app.config["FILEFORMATS"]
                ):
                    if documentTypeFile in fileFormat:
                        mimeTypeFile = fileMimetype
    except (ValueError, PdfReadError):
        mimeTypeFile = "Unknown/Corrupted"
    return mimeTypeFile
def has_PDFA_XMP(file):
    """Report whether a PDF's XMP metadata declares an accepted PDF/A version.

    Args:
        file: Path of the PDF to inspect (anything ``open`` accepts).

    Returns:
        ``True`` when the file has XMP metadata and the version returned by
        ``get_pdfa_version`` is listed in
        ``app.config["PDFA"]["ACCEPTED_VERSIONS"]``; ``False`` otherwise,
        including when the file cannot be opened or parsed.
    """
    try:
        with open(file, mode="rb") as fileData:
            xmpfile = PdfFileReader(fileData, strict=False)
            metadata = xmpfile.getXmpMetadata()
            if metadata is None:
                return False
            pdfa = app.config["PDFA"]
            nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
            return get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]
    except Exception:
        # Best-effort check: any read/parse failure means "not PDF/A".
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
def pdfHandler(file_dir):
    """Print diagnostic information about a PDF to stdout.

    Prints the page count, the types of the form-field, document-info and
    XMP-metadata objects, every document-info entry, and every
    ``dc_relation`` value found in the XMP metadata.

    Args:
        file_dir: Path of the PDF file to inspect.

    Returns:
        None.
    """
    # Open the file in a with-block so the handle is closed even if parsing
    # fails. The reader is handed the open stream, so every query stays
    # inside the block.
    with open(file_dir, 'rb') as pdf_stream:
        input1 = PdfFileReader(pdf_stream)
        # NOTE(review): the message hard-codes 'document1.pdf' instead of
        # naming file_dir; left unchanged so the output stays the same.
        print('document1.pdf has {} pages.'.format(str(input1.getNumPages())))

        fields = input1.getFields()
        print(type(fields))

        documentInfo = input1.getDocumentInfo()
        print(type(documentInfo))
        if documentInfo is not None:
            for key in documentInfo.keys():
                print('{} : {}'.format(key, documentInfo.get(key)))

        metaData = input1.getXmpMetadata()
        print(type(metaData))
        if metaData is not None:
            for relation in metaData.dc_relation:
                print('relation: {}'.format(relation))