Esempio n. 1
0
def call(jobs):
    """Rearrange each job's bag-style SIP into the standard SIP layout.

    For every job, ``job.args[1]`` is used as the SIP path. The entries of
    its ``data`` directory are moved up one level, ``metadata`` and ``logs``
    are moved out of ``objects``, and every remaining top-level entry that is
    not part of the SIP structure is moved into
    ``metadata/submissionDocumentation``. An ``IOError`` raised along the way
    is printed through the job and sets the job status to 1.
    """
    for job in jobs:
        with job.JobContext(logger=logger):
            try:
                sip_root = job.args[1]
                data_dir = os.path.join(sip_root, 'data')
                objects_path = os.path.join(sip_root, 'objects')
                subm_doc_path = os.path.join(
                    sip_root, 'metadata', 'submissionDocumentation')

                # Lift everything out of data/, then drop the emptied folder
                for name in os.listdir(data_dir):
                    _move_file(job,
                               os.path.join(data_dir, name),
                               os.path.join(sip_root, name))
                os.rmdir(data_dir)

                # Move metadata and logs out of objects if they exist
                for folder in ('metadata', 'logs'):
                    _move_file(job,
                               os.path.join(objects_path, folder),
                               os.path.join(sip_root, folder),
                               exit_on_error=False)

                # Anything that is not part of the SIP structure goes to
                # submission documentation; this includes the original METS,
                # whose new location is remembered for the step below.
                os.makedirs(subm_doc_path)
                sip_structure = (archivematicaFunctions.OPTIONAL_FILES +
                                 archivematicaFunctions.REQUIRED_DIRECTORIES)
                mets_file_path = None
                for name in os.listdir(sip_root):
                    if name in sip_structure:
                        continue
                    target = os.path.join(subm_doc_path, name)
                    if name.startswith('METS.') and name.endswith('.xml'):
                        mets_file_path = target
                    _move_file(job, os.path.join(sip_root, name), target)

                # Reconstruct any empty directories documented in the METS
                # file under the logical structMap labelled "Normative
                # Directory Structure"
                if not mets_file_path:
                    logger.info(
                        'Unable to reconstruct empty directories: no METS file'
                        ' could be found in {}'.format(sip_root))
                else:
                    archivematicaFunctions.reconstruct_empty_directories(
                        mets_file_path, objects_path, logger=logger)

                archivematicaFunctions.create_structured_directory(
                    sip_root,
                    manual_normalization=True,
                    printing=True,
                    printfn=job.pyprint)
            except IOError as err:
                job.print_error(repr(err))
                job.set_status(1)
def restructure_transfer(unit_path):
    """Create the structured directories in *unit_path* and move the rest into objects.

    Directories not listed in ``REQUIRED_DIRECTORIES`` and regular files not
    listed in ``OPTIONAL_FILES`` are moved into ``<unit_path>/objects``.
    """
    # Create required directories
    create_structured_directory(unit_path, printing=True)

    # Every move targets the objects directory itself
    objects_dir = os.path.join(unit_path, "objects", '.')
    for name in os.listdir(unit_path):
        path = os.path.join(unit_path, name)
        stray_dir = os.path.isdir(path) and name not in REQUIRED_DIRECTORIES
        if stray_dir or (os.path.isfile(path) and name not in OPTIONAL_FILES):
            _move_file(path, objects_dir)
def call(jobs):
    """Create a SIP for every container directory in a transfer's objects directory.

    Each job supplies its arguments through ``job.args``:

    1. objects directory that is scanned for containers
    2. transfer name (prefix of each SIP name)
    3. transfer UUID (used to look up the transfer's ``File`` rows)
    4. processing directory in which each SIP is assembled
    5. directory the assembled SIP is finally moved to
    6. shared path, replaced by ``%sharedPath%`` in the recorded SIP path

    Entries of the objects directory that are not directories are reported
    on stderr and skipped. All jobs run inside one database transaction.
    """
    with transaction.atomic():
        for job in jobs:
            with job.JobContext():
                objectsDirectory = job.args[1]
                transferName = job.args[2]
                transferUUID = job.args[3]
                processingDirectory = job.args[4]
                autoProcessSIPDirectory = job.args[5]
                # Read from job.args like every other argument of this job.
                sharedPath = job.args[6]
                transfer_objects_directory = '%transferDirectory%objects'

                for container in os.listdir(objectsDirectory):
                    containerPath = os.path.join(objectsDirectory, container)
                    if not os.path.isdir(containerPath):
                        job.pyprint("file (not container) found: ", container, file=sys.stderr)
                        continue

                    # Only real containers get a SIP (and therefore a UUID)
                    sipUUID = str(uuid.uuid4())
                    sipName = "%s-%s" % (transferName, container)

                    tmpSIPDir = os.path.join(processingDirectory, sipName) + "/"
                    destSIPDir = os.path.join(autoProcessSIPDirectory, sipName) + "/"
                    archivematicaFunctions.create_structured_directory(tmpSIPDir, manual_normalization=True)
                    databaseFunctions.createSIP(destSIPDir.replace(sharedPath, '%sharedPath%'), sipUUID, printfn=job.pyprint)

                    # move the objects to the SIPDir
                    for item in os.listdir(containerPath):
                        shutil.move(os.path.join(containerPath, item), os.path.join(tmpSIPDir, "objects", item))

                    # get the database list of files in the objects directory
                    # for each file, confirm it's in the SIP objects directory,
                    # and update the current location / owning SIP
                    directory = os.path.join(transfer_objects_directory, container)
                    files = File.objects.filter(removedtime__isnull=True,
                                                currentlocation__startswith=directory,
                                                transfer_id=transferUUID)
                    for f in files:
                        # Drop the container component: the container's
                        # contents now sit directly under the SIP's objects/
                        currentPath = databaseFunctions.deUnicode(f.currentlocation).replace(directory, transfer_objects_directory)
                        currentSIPFilePath = currentPath.replace("%transferDirectory%", tmpSIPDir)
                        if os.path.isfile(currentSIPFilePath):
                            f.currentlocation = currentPath.replace("%transferDirectory%", "%SIPDirectory%")
                            f.sip_id = sipUUID
                            f.save()
                        else:
                            job.pyprint("file not found: ", currentSIPFilePath, file=sys.stderr)

                    # moveSIPTo autoProcessSIPDirectory
                    shutil.move(tmpSIPDir, destSIPDir)
        src = os.path.join(sip_path, 'data', item)
        dst = os.path.join(sip_path, item)
        _move_file(src, dst)

    os.rmdir(os.path.join(sip_path, 'data'))

    # Move metadata and logs out of objects if they exist
    src = os.path.join(sip_path, 'objects', 'metadata')
    dst = os.path.join(sip_path, 'metadata')
    _move_file(src, dst, exit_on_error=False)

    src = os.path.join(sip_path, 'objects', 'logs')
    dst = os.path.join(sip_path, 'logs')
    _move_file(src, dst, exit_on_error=False)

    # Move anything unexpected to submission documentation
    # Leave objects, metadata, etc
    # Original METS ends up in submissionDocumentation
    os.makedirs(os.path.join(sip_path, 'metadata', 'submissionDocumentation'))
    for item in os.listdir(sip_path):
        # Leave SIP structure
        if item in archivematicaFunctions.OPTIONAL_FILES + archivematicaFunctions.REQUIRED_DIRECTORIES:
            continue
        src = os.path.join(sip_path, item)
        dst = os.path.join(sip_path, 'metadata', 'submissionDocumentation',
                           item)
        _move_file(src, dst)

    archivematicaFunctions.create_structured_directory(
        sip_path, manual_normalization=True, printing=True)
    autoProcessSIPDirectory = sys.argv[5]
    sharedPath = sys.argv[6]
    transfer_objects_directory = '%transferDirectory%objects'

    for container in os.listdir(objectsDirectory):
        sipUUID = uuid.uuid4().__str__()
        containerPath = os.path.join(objectsDirectory, container)
        if not os.path.isdir(containerPath):
            print >> sys.stderr, "file (not container) found: ", container
            continue

        sipName = "%s-%s" % (transferName, container)

        tmpSIPDir = os.path.join(processingDirectory, sipName) + "/"
        destSIPDir = os.path.join(autoProcessSIPDirectory, sipName) + "/"
        archivematicaFunctions.create_structured_directory(
            tmpSIPDir, manual_normalization=True)
        databaseFunctions.createSIP(
            destSIPDir.replace(sharedPath, '%sharedPath%'), sipUUID)

        # move the objects to the SIPDir
        for item in os.listdir(containerPath):
            shutil.move(os.path.join(containerPath, item),
                        os.path.join(tmpSIPDir, "objects", item))

        # get the database list of files in the objects directory
        # for each file, confirm it's in the SIP objects directory, and update the current location/ owning SIP'
        directory = os.path.join(transfer_objects_directory, container)
        files = File.objects.filter(removedtime__isnull=True,
                                    currentlocation__startswith=directory,
                                    transfer_id=transferUUID)
        for f in files:
def restructure_transfer_aip(unit_path):
    """
    Restructure a transfer that comes from re-ingesting an Archivematica AIP.

    The current contents of *unit_path* are parked in ``old_bag``, the
    structured directories are created, and the METS file, metadata,
    submission documentation, objects and ``processingMCP.xml`` are moved
    from the old bag into place before ``old_bag`` is removed.

    :param unit_path: path of the transfer to restructure.
    :raises IOError: if no ``METS.<UUID>.xml`` file is found in
        ``old_bag/data``.
    """
    old_bag = os.path.join(unit_path, 'old_bag', '')
    os.makedirs(old_bag)

    # Move everything to old_bag
    for item in os.listdir(unit_path):
        if item == 'old_bag':
            continue
        src = os.path.join(unit_path, item)
        _move_file(src, old_bag)

    # Create required directories
    # - "/logs" and "/logs/fileMeta"
    # - "/metadata" and "/metadata/submissionDocumentation"
    # - "/objects"
    create_structured_directory(unit_path, printing=True)

    # Move /old_bag/data/METS.<UUID>.xml => /metadata/METS.<UUID>.xml
    p = re.compile(r'^METS\..*\.xml$', re.IGNORECASE)
    src = os.path.join(old_bag, 'data')
    mets_name = None
    for item in os.listdir(src):
        m = p.match(item)
        if m:
            mets_name = m.group()
            break  # Stop trying after the first match
    if mets_name is None:
        # Fail with a readable message instead of an AttributeError or
        # NameError on the missing match object.
        raise IOError('No METS file found in {}'.format(src))
    dst = os.path.join(unit_path, 'metadata')
    # The file is moved *into* the metadata directory, so its new path is the
    # directory plus the file name (not the directory itself).
    mets_file_path = os.path.join(dst, mets_name)
    _move_file(os.path.join(src, mets_name), dst)

    # Move /old_bag/data/objects/metadata/* => /metadata/
    src = os.path.join(old_bag, 'data', 'objects', 'metadata')
    dst = os.path.join(unit_path, 'metadata')
    if os.path.isdir(src):
        for item in os.listdir(src):
            item_path = os.path.join(src, item)
            _move_file(item_path, dst)
        shutil.rmtree(src)

    # Move /old_bag/data/objects/submissionDocumentation/* => /metadata/submissionDocumentation/
    src = os.path.join(old_bag, 'data', 'objects', 'submissionDocumentation')
    dst = os.path.join(unit_path, 'metadata', 'submissionDocumentation')
    for item in os.listdir(src):
        item_path = os.path.join(src, item)
        _move_file(item_path, dst)
    shutil.rmtree(src)

    # Move /old_bag/data/objects/* => /objects/
    src = os.path.join(old_bag, 'data', 'objects')
    objects_path = dst = os.path.join(unit_path, 'objects')
    for item in os.listdir(src):
        item_path = os.path.join(src, item)
        _move_file(item_path, dst)

    # Move /old_bag/processingMCP.xml => /processingMCP.xml
    src = os.path.join(old_bag, 'processingMCP.xml')
    dst = os.path.join(unit_path, 'processingMCP.xml')
    if os.path.isfile(src):
        _move_file(src, dst)

    # Get rid of old_bag
    shutil.rmtree(old_bag)

    # Reconstruct any empty directories documented in the METS file under the
    # logical structMap labelled "Normative Directory Structure"
    reconstruct_empty_directories(mets_file_path, objects_path, logger=logger)
Esempio n. 7
0
def call(jobs):
    """Turn each job's transfer into a SIP and move it to the auto-process directory.

    Arguments are read from ``job.args`` positions 1-6: objects directory,
    transfer name, transfer UUID, processing directory, auto-process SIP
    directory and shared path. The SIP is assembled in the processing
    directory under the transfer's name, the ``Directory`` and ``File`` rows
    of the transfer are pointed at the SIP, and the assembled directory is
    finally moved to the auto-process SIP directory. All jobs share one
    database transaction.
    """
    with transaction.atomic():
        for job in jobs:
            with job.JobContext():
                objects_dir = job.args[1]
                transfer_name = job.args[2]
                transfer_uuid = job.args[3]
                processing_dir = job.args[4]
                auto_process_dir = job.args[5]
                shared_path = job.args[6]

                # The SIP is named after the transfer
                staging_dir = os.path.join(processing_dir, transfer_name) + "/"
                final_dir = os.path.join(auto_process_dir, transfer_name) + "/"
                archivematicaFunctions.create_structured_directory(
                    staging_dir, manual_normalization=False)

                # A reingested AIP passes its type and UUID on to the SIP;
                # the UUID is taken from the name of the reingest METS file.
                sip_type = "SIP"
                sip_uuid = None
                transfer = Transfer.objects.get(uuid=transfer_uuid)
                if transfer.type == "Archivematica AIP":
                    sip_type = "AIP-REIN"
                    metadata_dir = os.path.join(objects_dir, "..", "metadata")
                    job.pyprint("path", metadata_dir,
                                "listdir", os.listdir(metadata_dir))
                    for entry in os.listdir(metadata_dir):
                        if entry.startswith("METS"):
                            without_prefix = entry.replace("METS.", "")
                            sip_uuid = without_prefix.replace(".xml", "")
                job.pyprint("sip_uuid", sip_uuid)
                job.pyprint("sip_type", sip_type)

                # Record on the SIP whether the source transfer has any
                # ``Directory`` models.
                dir_mdls = Directory.objects.filter(
                    transfer_id=transfer_uuid,
                    currentlocation__startswith="%transferDirectory%objects",
                )
                diruuids = len(dir_mdls) > 0

                # Reuse the SIP row for this path, or create it
                lookup_path = final_dir.replace(shared_path, "%sharedPath%")
                try:
                    sip = SIP.objects.get(currentpath=lookup_path)
                    if diruuids:
                        sip.diruuids = True
                        sip.save()
                except SIP.DoesNotExist:
                    sip_uuid = databaseFunctions.createSIP(
                        lookup_path,
                        UUID=sip_uuid,
                        sip_type=sip_type,
                        diruuids=diruuids,
                        printfn=job.pyprint,
                    )
                    sip = SIP.objects.get(uuid=sip_uuid)

                # Carry the transfer's activeAgent over to the SIP so that
                # events generated in Ingest can fall back to it when the
                # processing config requires no user interaction, e.g. the
                # "automated" processing config.
                try:
                    agent_var = UnitVariable.objects.get(
                        unittype="Transfer",
                        unituuid=transfer_uuid,
                        variable="activeAgent",
                    )
                except UnitVariable.DoesNotExist:
                    agent_var = None
                agent = None
                if agent_var:
                    try:
                        agent = Agent.objects.get(id=agent_var.variablevalue)
                    except Agent.DoesNotExist:
                        agent = None
                if agent is not None:
                    sip.update_active_agent(agent.userprofile.user_id)

                # Move the objects to the SIP directory
                for entry in os.listdir(objects_dir):
                    origin = os.path.join(objects_dir, entry)
                    target = os.path.join(staging_dir, "objects", entry)
                    if not (os.path.exists(target) and os.path.isdir(origin)):
                        shutil.move(origin, target)
                        continue
                    # target already exists: shutil.move would nest origin
                    # inside it, so move origin's contents instead.
                    for child in os.listdir(origin):
                        shutil.move(os.path.join(origin, child), target)

                # Point each ``Directory`` model found under objects/ at the
                # SIP, provided the directory really is in the SIP.
                for dir_mdl in dir_mdls:
                    location = databaseFunctions.deUnicode(
                        dir_mdl.currentlocation)
                    on_disk = location.replace(
                        "%transferDirectory%", staging_dir)
                    if not os.path.isdir(on_disk):
                        job.pyprint("directory not found: ",
                                    on_disk,
                                    file=sys.stderr)
                        continue
                    dir_mdl.currentlocation = location.replace(
                        "%transferDirectory%", "%SIPDirectory%")
                    dir_mdl.sip = sip
                    dir_mdl.save()

                # Same for the (not removed) ``File`` rows under objects/
                files = File.objects.filter(
                    transfer_id=transfer_uuid,
                    currentlocation__startswith="%transferDirectory%objects",
                    removedtime__isnull=True,
                )
                for f in files:
                    location = databaseFunctions.deUnicode(f.currentlocation)
                    on_disk = location.replace(
                        "%transferDirectory%", staging_dir)
                    if not os.path.isfile(on_disk):
                        job.pyprint("file not found: ",
                                    on_disk,
                                    file=sys.stderr)
                        continue
                    f.currentlocation = location.replace(
                        "%transferDirectory%", "%SIPDirectory%")
                    f.sip = sip
                    f.save()

                archivematicaFunctions.create_directories(
                    archivematicaFunctions.MANUAL_NORMALIZATION_DIRECTORIES,
                    basepath=staging_dir,
                )

                # Copy the JSON metadata file, if present; this contains a
                # serialized copy of DC metadata entered in the dashboard UI
                # during the transfer.
                dc_src = os.path.normpath(
                    os.path.join(objects_dir, "..", "metadata", "dc.json"))
                dc_dst = os.path.join(staging_dir, "metadata", "dc.json")
                if os.path.exists(dc_src):
                    shutil.copy(dc_src, dc_dst)

                # Copy processingMCP.xml from the directory above objects/
                transfer_dir = os.path.dirname(objects_dir[:-1])
                shutil.copy(
                    os.path.join(transfer_dir, "processingMCP.xml"),
                    os.path.join(staging_dir, "processingMCP.xml"))

                # Hand the assembled SIP over to autoProcessSIPDirectory
                shutil.move(staging_dir, final_dir)
Esempio n. 8
0
def call(jobs):
    """Turn each job's transfer into a SIP and move it to the auto-process directory.

    Arguments are read from ``job.args`` positions 1-6: objects directory,
    transfer name, transfer UUID, processing directory, auto-process SIP
    directory and shared path. The SIP is assembled in the processing
    directory under the transfer's name, the ``Directory`` and ``File`` rows
    of the transfer are pointed at the SIP, and the assembled directory is
    finally moved to the auto-process SIP directory. All jobs share one
    database transaction.
    """
    with transaction.atomic():
        for job in jobs:
            with job.JobContext():
                objectsDirectory = job.args[1]
                transferName = job.args[2]
                transferUUID = job.args[3]
                processingDirectory = job.args[4]
                autoProcessSIPDirectory = job.args[5]
                sharedPath = job.args[6]
                sipName = transferName

                tmpSIPDir = os.path.join(processingDirectory, sipName) + "/"
                destSIPDir = os.path.join(autoProcessSIPDirectory,
                                          sipName) + "/"
                archivematicaFunctions.create_structured_directory(
                    tmpSIPDir, manual_normalization=False)

                # If transfer is a reingested AIP, then pass that info to the SIP
                sip_type = 'SIP'
                sip_uuid = None
                transfer = Transfer.objects.get(uuid=transferUUID)
                if transfer.type == 'Archivematica AIP':
                    sip_type = 'AIP-REIN'
                    # Use reingested AIP's UUID as the SIP UUID
                    # Get AIP UUID from reingest METS name
                    job.pyprint(
                        'path', os.path.join(objectsDirectory, '..',
                                             'metadata'), 'listdir',
                        os.listdir(
                            os.path.join(objectsDirectory, '..', 'metadata')))
                    for item in os.listdir(
                            os.path.join(objectsDirectory, '..', 'metadata')):
                        if item.startswith('METS'):
                            sip_uuid = item.replace('METS.',
                                                    '').replace('.xml', '')
                job.pyprint('sip_uuid', sip_uuid)
                job.pyprint('sip_type', sip_type)

                # Find out if any ``Directory`` models were created for the source
                # ``Transfer``. If so, this fact gets recorded in the new ``SIP`` model.
                dir_mdls = Directory.objects.filter(
                    transfer_id=transferUUID,
                    currentlocation__startswith='%transferDirectory%objects')
                diruuids = len(dir_mdls) > 0

                # Create row in SIPs table if one doesn't already exist
                lookup_path = destSIPDir.replace(sharedPath, '%sharedPath%')
                try:
                    # Keep the model instance (not its uuid): it is modified
                    # and saved here and assigned to the ``sip`` relation of
                    # the Directory and File rows further down.
                    sip = SIP.objects.get(currentpath=lookup_path)
                    if diruuids:
                        sip.diruuids = True
                        sip.save()
                except SIP.DoesNotExist:
                    sip_uuid = databaseFunctions.createSIP(lookup_path,
                                                           UUID=sip_uuid,
                                                           sip_type=sip_type,
                                                           diruuids=diruuids,
                                                           printfn=job.pyprint)
                    sip = SIP.objects.get(uuid=sip_uuid)

                # Move the objects to the SIPDir
                for item in os.listdir(objectsDirectory):
                    src_path = os.path.join(objectsDirectory, item)
                    dst_path = os.path.join(tmpSIPDir, "objects", item)
                    # If dst_path already exists and is a directory, shutil.move
                    # will move src_path into it rather than overwriting it;
                    # to avoid incorrectly-nested paths, move src_path's contents
                    # into it instead.
                    if os.path.exists(dst_path) and os.path.isdir(src_path):
                        for subitem in os.listdir(src_path):
                            shutil.move(os.path.join(src_path, subitem),
                                        dst_path)
                    else:
                        shutil.move(src_path, dst_path)

                # Get the ``Directory`` models representing the subdirectories in the
                # objects/ directory. For each subdirectory, confirm it's in the SIP
                # objects/ directory, and update the current location and owning SIP.
                for dir_mdl in dir_mdls:
                    currentPath = databaseFunctions.deUnicode(
                        dir_mdl.currentlocation)
                    currentSIPDirPath = currentPath.replace(
                        "%transferDirectory%", tmpSIPDir)
                    if os.path.isdir(currentSIPDirPath):
                        dir_mdl.currentlocation = currentPath.replace(
                            "%transferDirectory%", "%SIPDirectory%")
                        dir_mdl.sip = sip
                        dir_mdl.save()
                    else:
                        job.pyprint("directory not found: ",
                                    currentSIPDirPath,
                                    file=sys.stderr)

                # Get the database list of files in the objects directory.
                # For each file, confirm it's in the SIP objects directory, and
                # update the current location / owning SIP.
                files = File.objects.filter(
                    transfer_id=transferUUID,
                    currentlocation__startswith='%transferDirectory%objects',
                    removedtime__isnull=True)
                for f in files:
                    currentPath = databaseFunctions.deUnicode(
                        f.currentlocation)
                    currentSIPFilePath = currentPath.replace(
                        "%transferDirectory%", tmpSIPDir)
                    if os.path.isfile(currentSIPFilePath):
                        f.currentlocation = currentPath.replace(
                            "%transferDirectory%", "%SIPDirectory%")
                        f.sip = sip
                        f.save()
                    else:
                        job.pyprint("file not found: ",
                                    currentSIPFilePath,
                                    file=sys.stderr)

                archivematicaFunctions.create_directories(
                    archivematicaFunctions.MANUAL_NORMALIZATION_DIRECTORIES,
                    basepath=tmpSIPDir)

                # Copy the JSON metadata file, if present; this contains a
                # serialized copy of DC metadata entered in the dashboard UI
                # during the transfer.
                src = os.path.normpath(
                    os.path.join(objectsDirectory, "..", "metadata",
                                 "dc.json"))
                dst = os.path.join(tmpSIPDir, "metadata", "dc.json")
                if os.path.exists(src):
                    shutil.copy(src, dst)

                # Copy processingMCP.xml file
                src = os.path.join(os.path.dirname(objectsDirectory[:-1]),
                                   "processingMCP.xml")
                dst = os.path.join(tmpSIPDir, "processingMCP.xml")
                shutil.copy(src, dst)

                # moveSIPTo autoProcessSIPDirectory
                shutil.move(tmpSIPDir, destSIPDir)