Exemple #1
0
def get_file_mimetype(file):
    """Return the MIME type detected for *file*.

    Detection happens in up to three stages:

    1. exiftool reports the ``File:MIMEType`` tag for ``file.name``.
    2. If that is ``application/pdf``, the XMP metadata is inspected and the
       result is upgraded to ``application/pdfa`` when the PDF/A version
       found is listed in ``app.config["PDFA"]["ACCEPTED_VERSIONS"]``.
    3. If exiftool reports one of ``app.config["GENERIC_MIMETYPES"]``,
       libmagic is asked instead; if its answer is still generic, the
       libmagic result for the first 2048 bytes is matched against
       ``app.config["FILEFORMATS"]`` to pick the corresponding entry of
       ``app.config["FILEMIMETYPES"]``.

    Args:
        file: Object exposing a ``name`` attribute that holds the path of
            the file to inspect.

    Returns:
        The detected MIME type string, or ``"Unknown/Corrupted"`` when a
        ``ValueError`` or ``PdfReadError`` is raised during detection.
    """
    try:
        # Use the helper as a context manager so the exiftool subprocess is
        # terminated deterministically instead of lingering until GC.
        with exiftool.ExifToolHelper() as exif_helper:
            mimeTypeFile = exif_helper.get_metadata(file.name)[0]["File:MIMEType"]

        if mimeTypeFile == "application/pdf":
            # Check whether the PDF is a PDF/A of an accepted version.
            with open(file.name, mode="rb") as fileData:
                pdf_reader = PdfFileReader(fileData, strict=False)
                try:
                    metadata = pdf_reader.getXmpMetadata()
                    if metadata:
                        pdfa = app.config["PDFA"]
                        nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
                        if get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]:
                            mimeTypeFile = "application/pdfa"
                except ExpatError:
                    # Malformed XMP is not fatal: keep "application/pdf".
                    app.logger.log(logging.WARNING, "File {0} has not well-formed XMP data, could not verify if application/pdf has PDF/A1 DOCINFO.".format(file.name))

        elif mimeTypeFile in app.config["GENERIC_MIMETYPES"]:
            # exiftool was not specific enough; ask libmagic for a MIME type.
            mimeTypeFile = magic.from_file(file.name, mime=True)
            if mimeTypeFile in app.config["GENERIC_MIMETYPES"]:
                with open(file.name, mode="rb") as fileData:
                    documentTypeFile = magic.from_buffer(fileData.read(2048))
                # zip() stops at the shorter list. The previous zip_longest()
                # padded with None, which raised TypeError on the membership
                # test or assigned None as the MIME type when the two config
                # lists differed in length.
                for fileMimetype, fileFormat in zip(app.config["FILEMIMETYPES"], app.config["FILEFORMATS"]):
                    # No early exit: when several formats match, the last
                    # matching pair wins, as before.
                    if documentTypeFile in fileFormat:
                        mimeTypeFile = fileMimetype
    except (ValueError, PdfReadError):
        mimeTypeFile = "Unknown/Corrupted"
    return mimeTypeFile
Exemple #2
0
def has_PDFA_XMP(file):
    """Tell whether the PDF at *file* carries XMP data for an accepted PDF/A version.

    Args:
        file: Path of the PDF file to open.

    Returns:
        ``True`` when the file has XMP metadata and the PDF/A version derived
        from the nodes in ``app.config["PDFA"]["NAMESPACE"]`` is listed in
        ``app.config["PDFA"]["ACCEPTED_VERSIONS"]``; ``False`` otherwise,
        including when the file cannot be opened or parsed.
    """
    try:
        with open(file, mode="rb") as fileData:
            metadata = PdfFileReader(fileData, strict=False).getXmpMetadata()
            if metadata is None:
                return False
            pdfa = app.config["PDFA"]
            nodes = metadata.getNodesInNamespace("", pdfa["NAMESPACE"])
            return get_pdfa_version(nodes) in pdfa["ACCEPTED_VERSIONS"]
    except Exception:
        # Best-effort check: any failure means "not a verified PDF/A".
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being reported as False.
        return False
Exemple #3
0
def pdfHandler(file_dir):
    """Print the page count, document info and XMP relations of a PDF.

    Args:
        file_dir: Path of the PDF file to open.

    The file is opened in a ``with`` block so the handle is closed when the
    function returns or raises; every read from the reader happens inside
    that block while the underlying stream is still open.
    """
    with open(file_dir, 'rb') as pdf_stream:
        input1 = PdfFileReader(pdf_stream)

        # NOTE: the label 'document1.pdf' is hard-coded and does not reflect
        # file_dir; kept unchanged so the printed output stays the same.
        print('document1.pdf has {} pages.'.format(input1.getNumPages()))

        fields = input1.getFields()
        print(type(fields))

        documentInfo = input1.getDocumentInfo()
        print(type(documentInfo))
        if documentInfo is not None:
            for key in documentInfo:
                print('{} : {}'.format(key, documentInfo.get(key)))

        metaData = input1.getXmpMetadata()
        print(type(metaData))
        if metaData is not None:
            for relation in metaData.dc_relation:
                print('relation: {}'.format(relation))