Ejemplo n.º 1
0
def lookup_bag(space, external_identifier, version):
    """Look up a bag in the storage service, trying each API in turn.

    The staging API is queried first, then the production API.  The first
    API that knows about the bag wins.

    :param space: Storage space to look in.
    :param external_identifier: External identifier of the bag.
    :param version: Bag version, passed straight through to ``get_bag``.
    :returns: Whatever ``client.get_bag`` returns for the first API that
        has the bag.
    :raises SystemExit: With exit status 1 if neither API has the bag.
    """
    logger.debug("Looking up bag %s/%s", space, external_identifier)

    api_variants = {"stage": "api-stage", "prod": "api"}

    for name, host in api_variants.items():
        # Log through the module's own ``logger`` throughout; the
        # ``logging.debug`` shortcut would send these to the root logger.
        logger.debug("Checking %s API", name)

        api_url = f"https://{host}.wellcomecollection.org/storage/v1"
        client = get_storage_client(api_url)

        try:
            bag = client.get_bag(space, external_identifier, version)
        except BagNotFound:
            # Not an error yet: fall through and try the next API.
            logger.debug("Not found in %s API", name)
        else:
            logger.debug("Found bag in %s API:", name)
            return bag

    logger.error("Could not find %s/%s in either API!", space, external_identifier)
    sys.exit(1)
Ejemplo n.º 2
0
def lookup_ingest(ingest_id):
    """Find an ingest by ID, checking the staging API and then production.

    Returns the result of ``get_ingest`` from the first API that knows the
    ingest.  If neither API has it, logs an error and exits the process
    with status 1.
    """
    logger.debug("Looking up ingest ID %s", ingest_id)

    # (label, hostname prefix) pairs, in the order they are tried.
    environments = (("stage", "api-stage"), ("prod", "api"))

    for env_name, subdomain in environments:
        logger.debug("Checking %s API", env_name)

        storage_client = get_storage_client(
            f"https://{subdomain}.wellcomecollection.org/storage/v1"
        )

        try:
            found = storage_client.get_ingest(ingest_id)
        except IngestNotFound:
            logger.debug("Not found in %s API", env_name)
            continue

        logger.debug("Found ingest in %s API:", env_name)
        return found

    logger.error("Could not find %s in either API!", ingest_id)
    sys.exit(1)
    missing_objects = object_refs - set(bag_files.keys())
    if missing_objects:
        abort(f"METS file {mets_name} refers to objects that are missing "
              f"from the bag: {', '.join(sorted(missing_objects))}")

    info(f"{mets_name}: All ALTO and object references are correct")


if __name__ == "__main__":
    # The b number is the only (required) command-line argument.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {__file__} <B_NUMBER>")
    b_number = sys.argv[1]

    client = get_storage_client(
        api_url="https://api.wellcomecollection.org/storage/v1"
    )

    bag = client.get_bag(space_id="digitised", source_id=b_number)
    info(
        f"Retrieved storage manifest for {b_number} {bag['version']} "
        f"from the API"
    )

    # Index the payload files by their path relative to the data directory;
    # manifest entries outside "data/" are ignored.
    data_prefix = "data/"
    bag_files = {
        entry["name"][len(data_prefix):]: entry
        for entry in bag["manifest"]["files"]
        if entry["name"].startswith(data_prefix)
    }

    root_mets_name = f"{b_number}.xml"

    # Find the METS file in the bag.  This should always be in the top level
    # of the data directory, named after the bnumber,
    #
Ejemplo n.º 4
0
def clone_bag(api_name, space, external_identifier):
    """Create a local "holey" clone of a bag from the storage service.

    The clone is created under ``_bags/`` and contains:

    * an empty ``data`` directory,
    * a ``fetch.txt`` with one tab-separated line (S3 URL, size, name) for
      every file in the bag's manifest, and
    * the files listed in the bag's tag manifest, downloaded from S3,
      except those whose name starts with ``tagmanifest-``.

    :param api_name: Which API to use: ``"prod"`` or ``"staging"``.
    :param space: Storage space of the bag.
    :param external_identifier: External identifier of the bag.
    :raises KeyError: If ``api_name`` is neither ``"prod"`` nor ``"staging"``.
    :raises SystemExit: If a directory for this bag already exists locally.
    """
    api_url = {
        "prod": "https://api.wellcomecollection.org/storage/v1",
        "staging": "https://api-stage.wellcomecollection.org/storage/v1",
    }[api_name]

    client = get_storage_client(api_url)

    bag = client.get_bag(space_id=space, source_id=external_identifier)

    os.makedirs("_bags", exist_ok=True)

    bag_slug = f"{space}_{slugify(external_identifier)}"

    # Keep staging clones apart from production clones of the same bag.
    if api_name == "staging":
        bag_slug = f"staging_{bag_slug}"

    bag_dir = os.path.join("_bags", bag_slug)

    # Creating the directory doubles as the "already cloned?" check.
    try:
        os.makedirs(bag_dir)
    except FileExistsError:
        sys.exit("You already have a clone of this bag!")

    os.makedirs(os.path.join(bag_dir, "data"))

    location = bag["location"]
    bucket = location["bucket"]
    path_prefix = location["path"]

    # Write fetch.txt as UTF-8 explicitly instead of relying on the
    # platform's default encoding.
    fetch_path = os.path.join(bag_dir, "fetch.txt")
    with open(fetch_path, "w", encoding="utf-8") as fetch_file:
        for manifest_file in sorted(bag["manifest"]["files"],
                                    key=lambda f: f["name"]):
            path = manifest_file["path"]
            size = manifest_file["size"]
            name = manifest_file["name"]

            fetch_file.write(
                f"s3://{bucket}/{path_prefix}/{path}\t{size}\t{name}\n")

    # Download some of the manifest files from the original bag.
    s3 = get_read_only_aws_resource("s3")

    for tag_manifest_file in bag["tagManifest"]["files"]:
        path = tag_manifest_file["path"]
        name = tag_manifest_file["name"]

        # NOTE(review): tag manifests are skipped, presumably because the
        # save() call below writes fresh ones -- confirm.
        if name.startswith("tagmanifest-"):
            continue

        s3.Bucket(bucket).download_file(Key=f"{path_prefix}/{path}",
                                        Filename=os.path.join(bag_dir, name))

    # Use a separate name for the on-disk bag so it does not shadow the
    # API response used above.
    local_bag = bagit.Bag(bag_dir)
    local_bag.save(manifests=True)

    print(f"✨ Created your new bag at {bag_dir} ✨")
    print("")
    print("You can use the bagit-python tool to validate your new bag:")
    print("")
    print(f"$ bagit.py --validate {bag_dir}")
    print("")
Ejemplo n.º 5
0
logger = get_logger(__name__)

if __name__ == "__main__":
    # The ID of the ingest to re-submit is the only (required) argument.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {__file__} <INGEST_ID>")
    ingest_id = sys.argv[1]

    ingest = lookup_ingest(ingest_id)

    # Send the new ingest to the same API the original ingest came from.
    api_host = "api-stage" if "api-stage" in ingest["@context"] else "api"
    api_url = f"https://{api_host}.wellcomecollection.org/storage/v1"

    client = get_storage_client(api_url)

    # Re-use the space, source location and identifier of the old ingest.
    new_ingest_params = {
        "space_id": ingest["space"]["id"],
        "s3_bucket": ingest["sourceLocation"]["bucket"],
        "s3_key": ingest["sourceLocation"]["path"],
        "external_identifier": ingest["bag"]["info"]["externalIdentifier"],
    }
    location = client.create_s3_ingest(**new_ingest_params)

    logger.info("Ingest created at URL %s", location)

    # The new ingest ID is the final path segment of the returned URL.
    new_ingest_id = location.rsplit("/", 1)[-1]
    logger.info("Ingest has ID %s", new_ingest_id)
    logger.info(
        "To look up the ingest:\n\n\tpython ss_get_ingest.py %s",
        new_ingest_id,
    )