Example #1
def updateSizeAndChecksum(fileUUID,
                          filePath,
                          date,
                          eventIdentifierUUID,
                          fileSize=None,
                          checksum=None,
                          checksumType=None,
                          add_event=True):
    """
    Update a File with its size, checksum and checksum type. These are
    parameters that can be either generated or provided via keywords.

    Finally, insert the corresponding Event. This behavior can be cancelled
    using the boolean keyword 'add_event'.
    """
    if not fileSize:
        fileSize = os.path.getsize(filePath)
    if not checksumType:
        checksumType = get_setting('checksum_type', 'sha256')
    if not checksum:
        checksum = get_file_checksum(filePath, checksumType)

    File.objects.filter(uuid=fileUUID).update(size=fileSize,
                                              checksum=checksum,
                                              checksumtype=checksumType)

    if add_event:
        insertIntoEvents(
            fileUUID=fileUUID,
            eventIdentifierUUID=eventIdentifierUUID,
            eventType='message digest calculation',
            eventDateTime=date,
            eventDetail='program="python"; module="hashlib.{}()"'.format(
                checksumType),
            eventOutcomeDetailNote=checksum)
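
Every example on this page delegates the actual hashing to get_file_checksum. As a rough sketch of what such a helper might look like, assuming only the standard library (the chunked-read implementation below is inferred from the call sites above, not the project's confirmed code):

import hashlib

def get_file_checksum(file_path, checksum_type='sha256', chunk_size=1024 * 1024):
    # Hash the file in fixed-size chunks so large files are never
    # loaded into memory all at once.
    hasher = hashlib.new(checksum_type)
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()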
Example #2
def verifyMetsFileSecChecksums(job, metsFile, date, taskUUID, relativeDirectory="./"):
    job.print_output(metsFile)
    exitCode = 0
    tree = etree.parse(metsFile)
    root = tree.getroot()
    for item in root.findall("{http://www.loc.gov/METS/}fileSec/{http://www.loc.gov/METS/}fileGrp/{http://www.loc.gov/METS/}file"):
        checksum = item.get("CHECKSUM")
        checksumType = item.get('CHECKSUMTYPE', '').lower()

        for item2 in item:
            if item2.tag == "{http://www.loc.gov/METS/}FLocat":
                fileLocation = item2.get("{http://www.w3.org/1999/xlink}href")

        fileFullPath = os.path.join(relativeDirectory, fileLocation)

        if checksumType and checksumType in hashlib.algorithms_available:
            checksum2 = get_file_checksum(fileFullPath, checksumType)
            # eventDetail = 'program="python"; module="hashlib.{}()"'.format(checksumType)
        else:
            job.pyprint("Unsupported checksum type: %s" % (checksumType.__str__()), file=sys.stderr)
            return 300

        if checksum != checksum2:
            eventOutcome = "Fail"
            job.print_output("%s - %s - %s" % (checksum == checksum2, checksum, checksum2))
            job.print_error(eventOutcome, fileFullPath)
            exitCode += 22
        else:
            eventOutcome = "Pass"
            job.print_output(eventOutcome, fileLocation)

    return exitCode
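
The fully spelled-out Clark notation in the findall() above works, but lxml also accepts a prefix-to-URI mapping, which keeps the path expressions readable. A small standalone sketch of the same traversal (mets.xml is a hypothetical input file; the namespace URIs are the standard METS and XLink ones):

from lxml import etree

NSMAP = {
    'mets': 'http://www.loc.gov/METS/',
    'xlink': 'http://www.w3.org/1999/xlink',
}

root = etree.parse('mets.xml').getroot()  # hypothetical file name
for file_el in root.findall('mets:fileSec/mets:fileGrp/mets:file', namespaces=NSMAP):
    checksum = file_el.get('CHECKSUM')
    flocat = file_el.find('mets:FLocat', namespaces=NSMAP)
    if flocat is not None:
        # Attribute lookups still need Clark notation ({uri}name).
        print(checksum, flocat.get('{%s}href' % NSMAP['xlink']))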
Example #3
def verify_checksum(
    job, file_uuid, path, checksum, checksumtype, event_id=None, date=None
):
    """
    Verify the checksum of a given file, and create a fixity event.

    :param str file_uuid: UUID of the file to verify
    :param str path: Path of the file to verify
    :param str checksum: Checksum to compare against
    :param str checksumtype: Type of the provided checksum (md5, sha256, etc)
    :param str event_id: Event ID
    :param str date: Date of the event
    """
    if event_id is None:
        event_id = str(uuid.uuid4())
    if date is None:
        date = timezone.now().isoformat(" ")

    checksumtype = checksumtype.lower()
    generated_checksum = get_file_checksum(path, checksumtype)
    event_detail = 'program="python"; module="hashlib.{}()"'.format(checksumtype)
    if checksum != generated_checksum:
        job.pyprint("Checksum failed")
        event_outcome = "Fail"
        detail_note = "Dataverse checksum %s verification failed" % checksum
    else:
        job.pyprint("Checksum passed")
        event_outcome = "Pass"
        detail_note = "Dataverse checksum %s verified" % checksum

    databaseFunctions.insertIntoEvents(
        fileUUID=file_uuid,
        eventIdentifierUUID=event_id,
        eventType="fixity check",
        eventDateTime=date,
        eventDetail=event_detail,
        eventOutcome=event_outcome,
        eventOutcomeDetailNote=detail_note,
    )
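
A hedged usage sketch for verify_checksum: the StubJob class is a made-up stand-in for the client-script job object (only pyprint is needed here), the UUID, path, and checksum values are illustrative, and the call still requires the Archivematica modules (databaseFunctions, get_file_checksum) to be importable, since it also writes an Event row:

class StubJob:
    # Minimal stand-in exposing the one method verify_checksum uses.
    def pyprint(self, *args, **kwargs):
        print(*args, **kwargs)

verify_checksum(
    StubJob(),
    file_uuid='00000000-0000-0000-0000-000000000000',  # hypothetical
    path='/tmp/example.bin',                           # hypothetical
    checksum='d41d8cd98f00b204e9800998ecf8427e',       # MD5 of an empty file
    checksumtype='MD5',  # lowercased inside verify_checksum
)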
Example #4
def verifyChecksum(fileUUID, filePath, date, eventIdentifierUUID):
    f = File.objects.get(uuid=fileUUID)

    if f.checksum in ('', 'None'):
        print('No checksum found in database for file:',
              fileUUID,
              filePath,
              file=sys.stderr)
        sys.exit(1)

    checksumFile = get_file_checksum(filePath, f.checksumtype)

    eventOutcome = ''
    eventOutcomeDetailNote = ''
    exitCode = 0

    if checksumFile != f.checksum:
        eventOutcomeDetailNote = str(checksumFile) + ' != ' + f.checksum
        eventOutcome = 'Fail'
        exitCode = 2
        print('Checksums do not match:', fileUUID, filePath, file=sys.stderr)
        print(eventOutcomeDetailNote, file=sys.stderr)
    else:
        eventOutcomeDetailNote = '%s verified' % checksumFile
        eventOutcome = 'Pass'
        exitCode = 0

    databaseFunctions.insertIntoEvents(
        fileUUID=fileUUID,
        eventIdentifierUUID=eventIdentifierUUID,
        eventType='fixity check',
        eventDateTime=date,
        eventOutcome=eventOutcome,
        eventOutcomeDetailNote=eventOutcomeDetailNote,
        eventDetail='program="python"; module="hashlib.{}()"'.format(
            f.checksumtype))

    sys.exit(exitCode)
Example #5
def call(jobs):
    with transaction.atomic():
        for job in jobs:
            with job.JobContext():
                # job.args[2] (transferName) is unused.
                transferUUID = job.args[1]
                transferPath = job.args[3]
                date = job.args[4]
                exitCode = 0

                for transfer_dir in os.listdir(transferPath):
                    dirPath = os.path.join(transferPath, transfer_dir)
                    if not os.path.isdir(dirPath):
                        continue
                    for transfer_file in os.listdir(dirPath):
                        filePath = os.path.join(dirPath, transfer_file)
                        if transfer_file == 'ContainerMetadata.xml' or transfer_file.endswith(
                                'Metadata.xml'
                        ) or not os.path.isfile(filePath):
                            continue

                        i = transfer_file.rfind('.')
                        if i != -1:
                            xmlFile = transfer_file[:i] + '_Metadata.xml'
                        else:
                            xmlFile = transfer_file + '_Metadata.xml'
                        xmlFilePath = os.path.join(dirPath, xmlFile)
                        try:
                            tree = etree.parse(xmlFilePath)
                            root = tree.getroot()

                            xmlMD5 = root.find('Document/MD5').text
                        except Exception:
                            job.pyprint('Error parsing: ',
                                        xmlFilePath,
                                        file=sys.stderr)
                            exitCode += 1
                            continue

                        objectMD5 = get_file_checksum(filePath, 'md5')

                        if objectMD5 == xmlMD5:
                            job.pyprint(
                                'File OK: ', xmlMD5,
                                filePath.replace(transferPath,
                                                 '%TransferDirectory%'))

                            fileID = getFileUUIDLike(filePath, transferPath,
                                                     transferUUID, 'transfer',
                                                     '%transferDirectory%')
                            for path, fileUUID in fileID.items():
                                eventDetail = 'program="python"; module="hashlib.md5()"'
                                eventOutcome = 'Pass'
                                eventOutcomeDetailNote = '%s verified' % xmlFile
                                eventIdentifierUUID = str(uuid.uuid4())

                                databaseFunctions.insertIntoEvents(
                                    fileUUID=fileUUID,
                                    eventIdentifierUUID=eventIdentifierUUID,
                                    eventType='fixity check',
                                    eventDateTime=date,
                                    eventOutcome=eventOutcome,
                                    eventOutcomeDetailNote=eventOutcomeDetailNote,
                                    eventDetail=eventDetail)
                        else:
                            job.pyprint('Checksum mismatch: ',
                                        filePath.replace(
                                            transferPath,
                                            '%TransferDirectory%'),
                                        file=sys.stderr)
                            exitCode += 1

                job.set_status(exitCode)
Example #6
def main(aip_uuid, aip_name, compression, sip_dir, aip_filename):

    # Prep work
    mets_schema_location = 'http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd'
    premis_schema_location = 'info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-2.xsd'
    # Datetime format string from http://docs.python.org/2/library/datetime.html
    # %Y = 4 digit year, %m = 2 digit month, %d = 2 digit day
    # %H = 24-hour hour, %M = 2-digit minute, %S = 2 digit second
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")
    aip_identifier = aip_name+'-'+aip_uuid
    aip_path = os.path.join(sip_dir, aip_filename)
    # Get archive tool and version
    program, algorithm = compression.split('-')

    # Pointer files are not written for uncompressed AIPs;
    # the purpose of the pointer file is primarily to provide information
    # on how to read a compressed AIP file, so there isn't anything for
    # it to do when pointing at an uncompressed AIP.
    if program == 'None':
        return 0

    if program == '7z':
        archive_tool = '7-Zip'
        archive_tool_version = '9.20'  # TODO get this dynamically
    elif program == 'pbzip2':
        archive_tool = program
        archive_tool_version = '1.1.6'  # TODO get this dynamically
    # Format / file extension
    _, extension = os.path.splitext(aip_filename)
    # PRONOM ID and PRONOM name for each file extension
    pronom_conversion = {
        '.7z': {'puid': 'fmt/484', 'name': '7Zip format'},
        '.bz2': {'puid': 'x-fmt/268', 'name': 'BZIP2 Compressed Archive'},
    }
    num_files = 1
    # Get size
    try:
        aip_size = os.path.getsize(aip_path)
    except OSError:
        print("File {} does not exist or is inaccessible. Aborting.".format(aip_path), file=sys.stderr)
        return -1
    # Calculate checksum
    checksum_algorithm = get_setting('checksum_type', 'sha256')
    checksum = get_file_checksum(aip_path, checksum_algorithm)
    # Get package type (AIP, AIC)
    sip_metadata_uuid = '3e48343d-e2d2-4956-aaa3-b54d26eb9761'

    try:
        dc = DublinCore.objects.get(metadataappliestotype_id=sip_metadata_uuid,
                                    metadataappliestoidentifier=aip_uuid)
    except DublinCore.DoesNotExist:
        package_type = "Archival Information Package"
    else:
        package_type = dc.type

    # Namespaces
    nsmap = {
        # Default, unprefixed namespace
        'mets': namespaces.metsNS,
        'xsi': namespaces.xsiNS,
        'xlink': namespaces.xlinkNS,
    }
    # Set up structure
    E = ElementMaker(namespace=namespaces.metsNS, nsmap=nsmap)
    E_P = ElementMaker(namespace=namespaces.premisNS, nsmap={'premis': namespaces.premisNS})

    root = (
        E.mets(
            E.metsHdr(CREATEDATE=now),
            # amdSec goes here
            E.fileSec(
                E.fileGrp(USE='Archival Information Package'),
            ),
            E.structMap(
                TYPE='physical'
            ),
        )
    )
    # Namespaced attributes have to be added separately - don't know how to do
    # inline with E
    root.attrib[namespaces.xsiBNS+'schemaLocation'] = mets_schema_location

    add_amdsec_after = root.find('mets:metsHdr', namespaces=namespaces.NSMAP)
    filegrp = root.find('.//mets:fileGrp', namespaces=namespaces.NSMAP)
    structmap = root.find('.//mets:structMap', namespaces=namespaces.NSMAP)
    # For each file, add amdSec, file, fptr
    for admin_id in range(1, num_files+1):

        # amdSec
        amdsec_id = 'amdSec_{}'.format(admin_id)
        amdsec = E.amdSec(
            E.techMD(
                E.mdWrap(
                    E.xmlData(
                    ),
                    MDTYPE='PREMIS:OBJECT',  # mdWrap
                ),
                ID='techMD_1',  # techMD
            ),
            ID=amdsec_id,  # amdSec
        )
        # Add PREMIS:OBJECT
        obj = E_P.object(
            E_P.objectIdentifier(
                E_P.objectIdentifierType('UUID'),
                E_P.objectIdentifierValue(aip_uuid),
            ),
            E_P.objectCharacteristics(
                E_P.compositionLevel('1'),
                E_P.fixity(
                    E_P.messageDigestAlgorithm(checksum_algorithm),
                    E_P.messageDigest(checksum),
                ),
                E_P.size(str(aip_size)),
                E_P.format(
                    E_P.formatDesignation(
                        E_P.formatName(
                            pronom_conversion[extension]['name']),
                        E_P.formatVersion(),
                    ),
                    E_P.formatRegistry(
                        E_P.formatRegistryName('PRONOM'),
                        E_P.formatRegistryKey(
                            pronom_conversion[extension]['puid'])
                    ),
                ),
                E_P.creatingApplication(
                    E_P.creatingApplicationName(archive_tool),
                    E_P.creatingApplicationVersion(archive_tool_version),
                    E_P.dateCreatedByApplication(now),
                ),
            ),
            version='2.2',
        )
        obj.attrib[namespaces.xsiBNS+'type'] = 'premis:file'
        obj.attrib[namespaces.xsiBNS+'schemaLocation'] = premis_schema_location

        # Add as child of xmldata
        amdsec.find('.//mets:mdWrap[@MDTYPE="PREMIS:OBJECT"]/mets:xmlData', namespaces=namespaces.NSMAP).append(obj)

        # Add PREMIS:EVENT for compression & PREMIS:AGENTs
        # use archivematicaCreateMETS2 code
        elements = archivematicaCreateMETS2.createDigiprovMD(aip_uuid)
        for element in elements:
            amdsec.append(element)

        # add amdSec after previous amdSec (or metsHdr if first one)
        add_amdsec_after.addnext(amdsec)
        add_amdsec_after = amdsec

        # fileGrp
        file_ = E.file(
            E.FLocat(
                LOCTYPE="OTHER",
                OTHERLOCTYPE="SYSTEM",
            ),
            ID=aip_identifier
        )
        filegrp.append(file_)
        flocat = file_.find('mets:FLocat', namespaces=namespaces.NSMAP)
        flocat.attrib['{{{ns}}}href'.format(ns=namespaces.xlinkNS)] = aip_path

        # compression - 7z or tar.bz2
        if extension == '.7z':
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                TRANSFORMORDER='1',
                TRANSFORMTYPE='decompression',
                TRANSFORMALGORITHM=algorithm)
        elif extension == '.bz2':
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                TRANSFORMORDER='1',
                TRANSFORMTYPE='decompression',
                TRANSFORMALGORITHM='bzip2')
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                TRANSFORMORDER='2',
                TRANSFORMTYPE='decompression',
                TRANSFORMALGORITHM='tar')

        # structMap
        div = etree.SubElement(structmap, namespaces.metsBNS+'div', ADMID=amdsec_id, TYPE=package_type)
        etree.SubElement(div, namespaces.metsBNS+'fptr', FILEID=aip_identifier)

    print(etree.tostring(root, pretty_print=True, encoding='unicode'))

    # Write out pointer.xml
    xml_filename = 'pointer.xml'
    filename = os.path.join(os.path.dirname(aip_path), xml_filename)
    with open(filename, 'wb') as f:
        # tostring() returns bytes when given an explicit encoding, so write in binary mode.
        f.write(etree.tostring(root, pretty_print=True, xml_declaration=True, encoding='utf-8'))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP='%SIPDirectory%'+xml_filename,
        fileUUID=str(uuid.uuid4()),
        sipUUID=aip_uuid,
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
    )
    return 0
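
The E and E_P builders in this example come from lxml.builder.ElementMaker: attribute access on the maker creates elements in its namespace, and keyword arguments become XML attributes. A standalone sketch of the pattern (the CREATEDATE value is made up):

from lxml import etree
from lxml.builder import ElementMaker

METS_NS = 'http://www.loc.gov/METS/'
E = ElementMaker(namespace=METS_NS, nsmap={'mets': METS_NS})

# E.mets(...) builds a {http://www.loc.gov/METS/}mets element;
# keyword arguments such as CREATEDATE become attributes.
doc = E.mets(
    E.metsHdr(CREATEDATE='2024-01-01T00:00:00'),
    E.fileSec(E.fileGrp(USE='Archival Information Package')),
)
print(etree.tostring(doc, pretty_print=True, encoding='unicode'))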