Esempio n. 1
0
def index_aip(job):
    """Write AIP information to ElasticSearch.

    The job arguments are read positionally: ``job.args[1]`` is the SIP
    UUID, ``job.args[2]`` the SIP name, ``job.args[3]`` the SIP staging
    directory and ``job.args[4]`` the SIP type.

    :param job: Job object providing ``args`` and ``pyprint``.
    :returns: 0 when AIP indexing is disabled, 1 when the Storage Service
        returns no information for the AIP, otherwise the value returned
        by ``elasticSearchFunctions.index_aip_and_files`` (1 is reported
        as an indexing error).
    """
    sip_uuid = job.args[1]  # %SIPUUID%
    sip_name = job.args[2]  # %SIPName%
    sip_staging_path = job.args[3]  # %SIPDirectory%
    sip_type = job.args[4]  # %SIPType%
    if "aips" not in mcpclient_settings.SEARCH_ENABLED:
        logger.info("Skipping indexing: AIPs indexing is currently disabled.")
        return 0
    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    job.pyprint("AIP info:", aip_info)
    # An empty result means the Storage Service does not know this AIP.
    # Fail with an explicit message rather than an IndexError on [0].
    if not aip_info:
        job.pyprint(
            "Information not found in Storage Service for AIP UUID:",
            sip_uuid,
            file=sys.stderr,
        )
        return 1
    aip_info = aip_info[0]
    mets_staging_path = os.path.join(sip_staging_path,
                                     "METS.{}.xml".format(sip_uuid))
    identifiers = get_identifiers(job, sip_staging_path)
    # If this is an AIC, find the number of AIP stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            # No count recorded for this AIC: index it without one.
            pass
    # Delete ES index before creating new one if reingesting
    if "REIN" in sip_type:
        job.pyprint(
            "Deleting outdated entry for AIP and AIP files with UUID",
            sip_uuid,
            "from archival storage",
        )
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)
    job.pyprint("Indexing AIP and AIP files")
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    ret = elasticSearchFunctions.index_aip_and_files(
        client=client,
        uuid=sip_uuid,
        aip_stored_path=aip_info["current_full_path"],
        mets_staging_path=mets_staging_path,
        name=sip_name,
        aip_size=aip_info["size"],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=aip_info["encrypted"],
        printfn=job.pyprint,
    )
    if ret == 1:
        job.pyprint("Error indexing AIP and AIP files", file=sys.stderr)
    return ret
Esempio n. 2
0
def processAIPThenDeleteMETSFile(path,
                                 temp_dir,
                                 es_client,
                                 delete_existing_data=False):
    """Extract an AIP's METS file and index the AIP and its files.

    The AIP UUID is taken from the base name of ``path``. Note that this
    function does not itself remove the extracted METS file.

    :param path: Path to the stored AIP (archive file or directory).
    :param temp_dir: Directory the METS file is extracted into.
    :param es_client: Elasticsearch client.
    :param delete_existing_data: When exactly ``True``, delete the AIP
        and its files from the indices before indexing.
    :returns: -1 when no UUID is found in the file name, 1 when the
        Storage Service has no information about the AIP, otherwise the
        value returned by ``elasticSearchFunctions.index_aip_and_files``.
    """
    archive_file = os.path.basename(path)

    # The extension varies (.7z, .tar.bz2, ...), so locate the UUID with
    # a regular expression instead of stripping a suffix.
    uuid_match = re.search(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        archive_file)
    if uuid_match is None:
        return -1
    aip_uuid = uuid_match.group()

    print("Processing AIP", aip_uuid)

    if delete_existing_data is True:
        print("Deleting AIP", aip_uuid, "from aips/aip and aips/aipfile.")
        elasticSearchFunctions.delete_aip(es_client, aip_uuid)
        elasticSearchFunctions.delete_aip_files(es_client, aip_uuid)

    # File names look like <name>-<uuid><extension>; the match ends right
    # before the extension. Dropping the trailing 37 characters (hyphen
    # plus 36-character UUID) leaves the AIP name.
    package_dir = archive_file[:uuid_match.end()]
    aip_name = package_dir[:-37]

    mets_name = "METS." + aip_uuid + ".xml"
    if os.path.isfile(path):
        # When path is a regular file, the METS path is prefixed with the
        # <name>-<uuid> directory.
        relative_mets = os.path.join(package_dir, "data", mets_name)
    else:
        relative_mets = os.path.join("data", mets_name)
    path_to_mets = extract_file(
        archive_path=path,
        destination_dir=temp_dir,
        relative_path=relative_mets,
    )

    # For an AIC, the number of AIPs it contains is indexed as well.
    root = etree.parse(path_to_mets)
    try:
        aip_type = ns.xml_find_premis(
            root,
            "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:type"
        ).text
    except AttributeError:
        # No dcterms:type element was found.
        aip_type = None

    aips_in_aic = None
    if aip_type == "Archival Information Collection":
        aips_in_aic = get_aips_in_aic(root, path, temp_dir)

    aip_info = storage_service.get_file_info(uuid=aip_uuid)
    if aip_info:
        return elasticSearchFunctions.index_aip_and_files(
            client=es_client,
            uuid=aip_uuid,
            aip_stored_path=path,
            mets_staging_path=path_to_mets,
            name=aip_name,
            aip_size=aip_info[0]["size"],
            aips_in_aic=aips_in_aic,
            identifiers=[],  # TODO get these
        )

    print("Information not found in Storage Service for AIP UUID: ",
          aip_uuid)
    return 1
Esempio n. 3
0
    def process_package(
        self, es_client, package_info, temp_dir, delete_before_reindexing, is_aic=False
    ):
        """Index package in 'aips' and 'aipfiles' indices.

        The package's METS file is downloaded into ``temp_dir`` and is
        removed again before this method returns or raises.

        :param es_client: Elasticsearch client.
        :param package_info: Package info dict returned by Storage
        Service. The keys "uuid", "current_path", "current_full_path"
        and "size" are required; "current_location" and "encrypted" are
        optional.
        :param temp_dir: Path to tempdir for downloaded METS files.
        :param delete_before_reindexing: Boolean of whether to delete
        package from indices prior to reindexing.
        :is_aic: Optional boolean to indicate if package being indexed
        is an AIC.

        :returns: Boolean indicating success.
        """
        uuid = package_info["uuid"]

        # Download the AIP METS file to a temporary directory.
        mets_relative_path = am.relative_path_to_aip_mets_file(
            package_info["uuid"], package_info["current_path"]
        )
        mets_filename = os.path.basename(mets_relative_path)
        mets_download_path = os.path.join(temp_dir, mets_filename)
        storageService.extract_file(uuid, mets_relative_path, mets_download_path)

        if not os.path.isfile(mets_download_path):
            error_message = "Unable to download AIP METS file from Storage Service"
            self.error(
                "Error indexing package {0}. Details: {1}".format(uuid, error_message)
            )
            return False

        # From here on the METS file exists on disk. The finally clause
        # removes it on every exit path, including exceptions that are
        # not handled below and therefore propagate to the caller.
        try:
            aips_in_aic = None
            if is_aic:
                mets_root = etree.parse(mets_download_path)
                aips_in_aic = get_aips_in_aic(mets_root, temp_dir, uuid)

            package_name = am.package_name_from_path(
                package_info["current_path"], remove_uuid_suffix=True
            )

            aip_location = package_info.get("current_location", "")
            location_description = storageService.retrieve_storage_location_description(
                aip_location
            )

            if delete_before_reindexing:
                self.info(
                    "Deleting package {} from 'aips' and 'aipfiles' indices.".format(
                        uuid
                    )
                )
                es.delete_aip(es_client, uuid)
                es.delete_aip_files(es_client, uuid)

            try:
                es.index_aip_and_files(
                    client=es_client,
                    uuid=uuid,
                    aip_stored_path=package_info["current_full_path"],
                    mets_staging_path=mets_download_path,
                    name=package_name,
                    aip_size=package_info["size"],
                    aips_in_aic=aips_in_aic,
                    encrypted=package_info.get("encrypted", False),
                    location=location_description,
                )
            except (ElasticsearchException, etree.XMLSyntaxError) as err:
                self.error(
                    "Error indexing package {0}. Details: {1}".format(uuid, err)
                )
                return False
            else:
                self.info("Successfully indexed package {}".format(uuid))
                return True
        finally:
            os.remove(mets_download_path)