Example #1
    def test_create_avalon_dip_success(self):
        """
        Test full Avalon DIP creation:
            - AIP download
            - AIP extraction
            - DIP folder creation
            - Avalon manifest CSV updated with the AIP UUID
            - METS file removed from the DIP
            - asset files present with their original filenames
        """
        with TmpDir(TMP_DIR):
            # Download the AIP first
            aip_path = amclient.AMClient(
                aip_uuid=AVALON_AIP_UUID,
                ss_url=SS_URL,
                ss_user_name=SS_USER_NAME,
                ss_api_key=SS_API_KEY,
                directory=TMP_DIR,
            ).download_aip()
            # Extract it
            aip_dir = create_dip.extract_aip(aip_path, AVALON_AIP_UUID, TMP_DIR)
            # Test DIP creation
            avalon_dip_dir = create_dip.create_dip(
                aip_dir, AVALON_AIP_UUID, OUTPUT_DIR, "atom", "avalon-manifest"
            )
            # Check DIP structure
            assert avalon_dip_dir == "{}/{}/{}".format(
                OUTPUT_DIR, TRANSFER_NAME, AVALON_AIP_UUID
            )
            assert os.path.isdir(avalon_dip_dir)

            # Check that CSV and folder are present, and METS file is removed
            assert sorted(os.listdir(avalon_dip_dir)) == sorted(
                ["Demo_Manifest.csv", "assets"]
            )

            # Check contents of CSV have been updated
            csv_path = "{}/Demo_Manifest.csv".format(avalon_dip_dir)
            is_in_file = False
            with open(csv_path, "rt") as c:
                demo_manifest = csv.reader(c, delimiter=",")
                for row in demo_manifest:
                    if AVALON_AIP_UUID in row:
                        is_in_file = True
            assert is_in_file

            # Check that files are present
            avalon_files = os.listdir("{}/assets".format(avalon_dip_dir))
            assets = [
                "agz3068a.wav",
                "lunchroom_manners_512kb.mp4",
                "lunchroom_manners_512kb.mp4.structure.xml",
                "lunchroom_manners_512kb.mp4.vtt",
                "OrganClip.high.mp4",
                "OrganClip.low.mp4",
                "OrganClip.medium.mp4",
            ]
            assert sorted(assets) == sorted(avalon_files)
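
The test above relies on module-level fixtures that the snippet does not show. A minimal sketch of what they might look like; the names are kept from the test, but all values and the TmpDir import path are hypothetical:

# Hypothetical fixtures for the test above; adjust values to your environment.
import csv
import os

import amclient

import create_dip
from tmp_dir import TmpDir  # assumed helper: context manager for a scratch dir

SS_URL = "http://127.0.0.1:62081"  # Storage Service URL (placeholder)
SS_USER_NAME = "test"              # SS API user (placeholder)
SS_API_KEY = "test"                # SS API key (placeholder)
AVALON_AIP_UUID = "11111111-1111-1111-1111-111111111111"  # placeholder UUID
TRANSFER_NAME = "Demo_Manifest"    # transfer name in the DIP path (placeholder)
TMP_DIR = ".tmp-dir"
OUTPUT_DIR = ".output-dir"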
Example #2
import logging
import os
import shutil
import sys
import time

import amclient
import requests

import settings

# Initialize Archivematica Python Client
am = amclient.AMClient()

# Import AM API info from settings.py
am.am_url = settings.am_url
am.am_api_key = settings.am_api_key
am.am_user_name = settings.am_user_name

am.ss_url = settings.ss_url
am.ss_api_key = settings.ss_api_key
am.ss_user_name = settings.ss_user_name

# Set institution code from user input
if len(sys.argv) < 2:
    sys.exit("Usage: {} <institution-code>".format(sys.argv[0]))
institution = sys.argv[1]

# Import institutional variables from settings.py
am.transfer_source = settings.INSTITUTION[institution]['transfer_source']
am.transfer_type = settings.INSTITUTION[institution]['transfer_type']
am.processing_config = settings.INSTITUTION[institution]['processing_config']

transfer_folder = '/' + institution + '/islandora/transfer/'  # directory to be watched
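
The script reads all connection details and per-institution settings from settings.py. A minimal sketch of the layout it assumes, based only on the attributes accessed above; every value is a placeholder:

# settings.py -- hypothetical layout assumed by the watcher script above.
am_url = "http://127.0.0.1:62080"   # Archivematica dashboard URL (placeholder)
am_user_name = "test"
am_api_key = "test"

ss_url = "http://127.0.0.1:62081"   # Storage Service URL (placeholder)
ss_user_name = "test"
ss_api_key = "test"

# One entry per institution code accepted on the command line.
INSTITUTION = {
    "demo": {
        "transfer_source": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee",  # location UUID
        "transfer_type": "standard",
        "processing_config": "automated",
    },
}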
Example #3
def main(
    ss_url,
    ss_user,
    ss_api_key,
    aip_uuid,
    tmp_dir,
    output_dir,
    mets_type="atom",
    dip_type="zipped-objects",
):
    LOGGER.info("Starting DIP creation from AIP: %s", aip_uuid)

    if not os.path.isdir(tmp_dir):
        LOGGER.error("%s is not a valid temporary directory", tmp_dir)
        return 1

    if not os.path.isdir(output_dir):
        LOGGER.error("%s is not a valid output directory", output_dir)
        return 2

    # Create empty workspace directory
    tmp_dir = os.path.join(tmp_dir, aip_uuid)
    if os.path.exists(tmp_dir):
        LOGGER.warning("Workspace directory already exists, overwriting")
        shutil.rmtree(tmp_dir)
    try:
        os.makedirs(tmp_dir)
    except OSError:
        LOGGER.error("Could not create workspace directory: %s", tmp_dir)
        return 3

    LOGGER.info("Downloading AIP from Storage Service")

    am_client = amclient.AMClient(
        aip_uuid=aip_uuid,
        ss_url=ss_url,
        ss_user_name=ss_user,
        ss_api_key=ss_api_key,
        directory=tmp_dir,
    )

    aip_file = am_client.download_aip()

    if not aip_file:
        LOGGER.error("Unable to download AIP")
        return 4

    LOGGER.info("Extracting AIP")
    aip_dir = extract_aip(aip_file, aip_uuid, tmp_dir)

    if not aip_dir:
        return 5

    LOGGER.info("Creating DIP")
    dip_dir = create_dip(aip_dir, aip_uuid, output_dir, mets_type, dip_type)

    if not dip_dir:
        LOGGER.error("Unable to create DIP")
        return 6

    # Remove workspace directory
    shutil.rmtree(tmp_dir)

    LOGGER.info("DIP created in: %s", dip_dir)

    return dip_dir
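
A hedged usage sketch for main(): integer return values are the error codes 1-6 seen above, anything else is the path of the created DIP. The URL, credentials, UUID and directories are placeholders:

# Illustrative call only; replace the placeholders with real values.
import logging

logging.basicConfig(level=logging.INFO)

result = main(
    ss_url="http://127.0.0.1:62081",
    ss_user="test",
    ss_api_key="test",
    aip_uuid="11111111-1111-1111-1111-111111111111",
    tmp_dir="/tmp",
    output_dir="/tmp/dips",
)
if isinstance(result, int):
    print("DIP creation failed with error code", result)
else:
    print("DIP created in", result)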
Example #4
def process_dip(
    ss_url,
    ss_user,
    ss_api_key,
    dip,
    pipeline_uuid,
    processing_uuid,
    s3_uuid,
    shared_directory,
    dip_path,
):
    # Remove the ContentDM compound-object file, if present
    contentdm = os.path.join(shared_directory, dip_path, dip,
                             "objects/compound.txt")
    try:
        os.remove(contentdm)
    except Exception as e:
        LOGGER.warning("Unable to remove contentDM file: %s", e)

    # Get the AIP UUID: the DIP directory name ends with it
    aip_uuid = dip[-36:]

    # Use the AM client to get info on the AIP
    am_client = amclient.AMClient(
        package_uuid=aip_uuid,
        ss_url=ss_url,
        ss_user_name=ss_user,
        ss_api_key=ss_api_key,
    )
    try:
        aip_details = am_client.get_package_details()
    except Exception as e:
        LOGGER.error("Unable to locate valid AIP package: %s", e)
        return 2

    # Get file list in DIP
    object_list = []
    thumbnail_list = []
    object_path = os.path.join(shared_directory, dip_path, dip, "objects")
    for root, _, files in os.walk(object_path):
        for name in files:
            rel_dir = os.path.relpath(
                root, os.path.join(shared_directory, dip_path, dip))
            object_list.append(os.path.join(rel_dir, name))
    if not object_list:
        LOGGER.error("Unable to find any access files in the DIP.")
        return 2

    thumbnail_path = os.path.join(shared_directory, dip_path, dip,
                                  "thumbnails")
    for root, _, files in os.walk(thumbnail_path):
        for name in files:
            rel_dir = os.path.relpath(
                root, os.path.join(shared_directory, dip_path, dip))
            thumbnail_list.append(os.path.join(rel_dir, name))

    # Load the METS file for the AIP
    mets_name = "METS." + aip_uuid + ".xml"
    try:
        mets = metsrw.METSDocument.fromfile(
            os.path.join(shared_directory, dip_path, dip, mets_name))
    except Exception as e:
        LOGGER.error("Unable to extract and load METS file: %s", e)
        return 2

    # Compile data for upload to S3
    size = 0
    for dirpath, _, filenames in os.walk(
            os.path.join(shared_directory, dip_path, dip)):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            size += os.path.getsize(file_path)
    dip_data = {
        "origin_pipeline": "/api/v2/pipeline/" + pipeline_uuid + "/",
        "origin_location": "/api/v2/location/" + processing_uuid + "/",
        "origin_path": os.path.join(dip_path, dip) + "/",
        "current_location": "/api/v2/location/" + s3_uuid + "/",
        "current_path": dip,
        "package_type": "DIP",
        "aip_subtype": "Archival Information Package",  # same as in AM
        "size": size,
        "related_package_uuid": aip_uuid,
    }
    LOGGER.info("Storing DIP in S3 location.")
    url = ss_url + "/api/v2/file/"
    headers = {"Authorization": "ApiKey " + ss_user + ":" + ss_api_key + ""}
    response = requests.post(url,
                             headers=headers,
                             json=dip_data,
                             timeout=86400)
    if response.status_code != requests.codes.created:
        LOGGER.error("Could not store DIP in S3 location: %s", response.text)
        return 2
    else:
        LOGGER.info("DIP stored in S3 location.")
        ret = response.json()
        if "uuid" in ret:
            dip_uuid = ret["uuid"]
            LOGGER.info("Storage Service DIP UUID: %s" % ret["uuid"])
        else:
            LOGGER.error("Storage Service didn't return the DIP UUID")
            return 2

    # Use the AM client to get info on the DIP
    LOGGER.info("Compiling DIP info.")
    am_client = amclient.AMClient(
        package_uuid=dip_uuid,
        ss_url=ss_url,
        ss_user_name=ss_user,
        ss_api_key=ss_api_key,
    )
    dip_details = am_client.get_package_details()

    dip_info = {}
    dip_info["dip-uuid"] = dip_details["uuid"]
    dip_info["dip-path"] = dip_details["current_full_path"]
    # Get bucket and region
    location_url = ss_url + dip_details["current_location"]
    headers = {"Authorization": "ApiKey " + ss_user + ":" + ss_api_key}
    location_response = requests.get(location_url,
                                     headers=headers,
                                     timeout=86400)
    space_url = ss_url + location_response.json()['space']
    space_response = requests.get(space_url, headers=headers, timeout=86400)
    dip_info["dip-bucket"] = space_response.json().get('bucket', "")
    dip_info["dip-region"] = space_response.json().get('region', "")

    dip_info["object-list"] = object_list
    dip_info["thumbnail-list"] = thumbnail_list
    dip_info["aip-uuid"] = aip_uuid

    # Get related AIP package info
    dip_info["aip-path"] = aip_details["current_full_path"]
    # Get bucket and region
    location_url = ss_url + aip_details["current_location"]
    headers = {"Authorization": "ApiKey " + ss_user + ":" + ss_api_key}
    location_response = requests.get(location_url,
                                     headers=headers,
                                     timeout=86400)
    space_url = ss_url + location_response.json()['space']
    space_response = requests.get(space_url, headers=headers, timeout=86400)
    dip_info["aip-bucket"] = space_response.json().get('bucket', "")
    dip_info["aip-region"] = space_response.json().get('region', "")
    # Get replicated AIP package info
    if aip_details["replicas"]:
        replica_uuid = os.path.basename(aip_details["replicas"][0][:-1])
        am_client = amclient.AMClient(
            package_uuid=replica_uuid,
            ss_url=ss_url,
            ss_user_name=ss_user,
            ss_api_key=ss_api_key,
        )
        replica_details = am_client.get_package_details()
        dip_info["replica-uuid"] = replica_uuid
        dip_info["replica-path"] = replica_details["current_full_path"]
        # Get bucket and region
        location_url = ss_url + replica_details["current_location"]
        headers = {"Authorization": "ApiKey " + ss_user + ":" + ss_api_key}
        location_response = requests.get(location_url,
                                         headers=headers,
                                         timeout=86400)
        space_url = ss_url + location_response.json()['space']
        space_response = requests.get(space_url,
                                      headers=headers,
                                      timeout=86400)
        dip_info["replica-bucket"] = space_response.json().get('bucket', "")
        dip_info["replica-region"] = space_response.json().get('region', "")
    else:
        dip_info["replica-uuid"] = ""
        dip_info["replica-bucket"] = ""
        dip_info["replica-region"] = ""

    # Return the data
    return dip_info, mets
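
A hedged usage sketch for process_dip(). The DIP directory name must end with the 36-character AIP UUID (see the dip[-36:] slice above); every UUID and path below is a placeholder:

# Illustrative call only; all UUIDs and paths are placeholders.
result = process_dip(
    ss_url="http://127.0.0.1:62081",
    ss_user="test",
    ss_api_key="test",
    dip="demo-transfer-11111111-1111-1111-1111-111111111111",
    pipeline_uuid="22222222-2222-2222-2222-222222222222",
    processing_uuid="33333333-3333-3333-3333-333333333333",
    s3_uuid="44444444-4444-4444-4444-444444444444",
    shared_directory="/var/archivematica/sharedDirectory/",
    dip_path="watchedDirectories/uploadedDIPs",
)
if result == 2:
    print("DIP processing failed; check the log for details")
else:
    dip_info, mets = result
    print("Stored DIP:", dip_info["dip-uuid"])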
Example #5
def main(
    ss_url,
    ss_user,
    ss_api_key,
    location_uuid,
    tmp_dir,
    output_dir,
    database_file,
    delete_local_copy,
    upload_type,
    pipeline_uuid,
    cp_location_uuid,
    ds_location_uuid,
    shared_directory,
    atom_url,
    atom_email,
    atom_password,
    atom_slug,
    rsync_target,
):
    LOGGER.info("Processing AIPs in SS location: %s", location_uuid)

    # Idempotently create the database and Aip table, then open a session
    try:
        session = models.init(database_file)
    except IOError:
        LOGGER.error("Could not create database in: %s", database_file)
        return 1

    # Get UPLOADED and VERIFIED AIPs from the SS
    try:
        am_client = amclient.AMClient(
            ss_url=ss_url, ss_user_name=ss_user, ss_api_key=ss_api_key
        )
        # An issue in the SS API prevents filtering
        # the results by location. See:
        # https://github.com/artefactual/archivematica-storage-service/issues/298
        aips = am_client.aips({"status__in": "UPLOADED,VERIFIED"})
    except Exception as e:
        LOGGER.error(e)
        return 2

    # Get only AIPs from the specified location
    aip_uuids = filter_aips(aips, location_uuid)

    # Create DIPs for those AIPs
    for uuid in aip_uuids:
        try:
            # To avoid race conditions while checking for an existing AIP
            # and saving it, create the row directly and check for an
            # integrity error exception (the uuid is a unique column)
            db_aip = models.Aip(uuid=uuid)
            session.add(db_aip)
            session.commit()
        except exc.IntegrityError:
            session.rollback()
            LOGGER.debug("Skipping AIP (already processed/processing): %s", uuid)
            continue

        mets_type = "atom"
        if upload_type == "ss-upload":
            mets_type = "storage-service"

        dip_path = create_dip.main(
            ss_url=ss_url,
            ss_user=ss_user,
            ss_api_key=ss_api_key,
            aip_uuid=uuid,
            tmp_dir=tmp_dir,
            output_dir=output_dir,
            mets_type=mets_type,
        )

        # Do not attempt upload on creation error (integer returns are error codes)
        if isinstance(dip_path, int):
            LOGGER.error("Could not create DIP from AIP: %s", uuid)
            continue

        if upload_type == "ss-upload":
            storage_service_upload.main(
                ss_url=ss_url,
                ss_user=ss_user,
                ss_api_key=ss_api_key,
                pipeline_uuid=pipeline_uuid,
                cp_location_uuid=cp_location_uuid,
                ds_location_uuid=ds_location_uuid,
                shared_directory=shared_directory,
                dip_path=dip_path,
                aip_uuid=uuid,
                delete_local_copy=delete_local_copy,
            )
        elif upload_type == "atom-upload":
            atom_upload.main(
                atom_url=atom_url,
                atom_email=atom_email,
                atom_password=atom_password,
                atom_slug=atom_slug,
                rsync_target=rsync_target,
                dip_path=dip_path,
                delete_local_copy=delete_local_copy,
            )

    LOGGER.info("All AIPs have been processed")