def lookup_bag(space, external_identifier, version):
    """Fetch a bag from the storage service, trying the staging API first,
    then the prod API.

    Exits the process with status 1 if the bag is not found in either API.
    """
    logger.debug("Looking up bag %s/%s", space, external_identifier)

    api_variants = {"stage": "api-stage", "prod": "api"}

    for name, host in api_variants.items():
        # Use the module-level ``logger`` consistently -- the original mixed
        # ``logger`` and the root ``logging`` module, unlike lookup_ingest().
        logger.debug("Checking %s API", name)
        api_url = f"https://{host}.wellcomecollection.org/storage/v1"
        client = get_storage_client(api_url)

        try:
            # Renamed from ``ingest`` -- this call returns a bag, the old
            # name was a copy-paste from lookup_ingest().
            bag = client.get_bag(space, external_identifier, version)
        except BagNotFound:
            logger.debug("Not found in %s API", name)
        else:
            logger.debug("Found bag in %s API:", name)
            return bag

    logger.error("Could not find %s/%s in either API!", space, external_identifier)
    sys.exit(1)
def lookup_ingest(ingest_id):
    """Fetch an ingest by ID, checking the staging API before the prod API.

    Exits the process with status 1 if neither API knows about the ingest.
    """
    logger.debug("Looking up ingest ID %s", ingest_id)

    # (label, hostname) pairs, checked in order: staging first, then prod.
    for label, hostname in [("stage", "api-stage"), ("prod", "api")]:
        logger.debug("Checking %s API", label)
        endpoint = f"https://{hostname}.wellcomecollection.org/storage/v1"
        storage_client = get_storage_client(endpoint)

        try:
            result = storage_client.get_ingest(ingest_id)
        except IngestNotFound:
            logger.debug("Not found in %s API", label)
            continue

        logger.debug("Found ingest in %s API:", label)
        return result

    logger.error("Could not find %s in either API!", ingest_id)
    sys.exit(1)
# NOTE(review): this first fragment appears to be the tail of a
# METS-validation routine whose start is outside this chunk --
# ``object_refs``, ``bag_files``, ``mets_name``, ``abort`` and ``info``
# are all defined elsewhere in the file.
missing_objects = object_refs - set(bag_files.keys())
if missing_objects:
    # Every object referenced from the METS file must exist in the bag.
    abort(f"METS file {mets_name} refers to objects that are missing "
          f"from the bag: {', '.join(sorted(missing_objects))}")

info(f"{mets_name}: All ALTO and object references are correct")


if __name__ == "__main__":
    # The b number (e.g. b12345678) is the only command-line argument.
    try:
        b_number = sys.argv[1]
    except IndexError:
        sys.exit(f"Usage: {__file__} <B_NUMBER>")

    client = get_storage_client(
        api_url="https://api.wellcomecollection.org/storage/v1")

    # Fetch the latest storage manifest for this digitised b number.
    bag = client.get_bag(space_id="digitised", source_id=b_number)
    info("Retrieved storage manifest for %s %s from the API"
         % (b_number, bag["version"]))

    # Map each file's path *relative to* data/ to its manifest entry;
    # files outside data/ are deliberately excluded.
    bag_files = {
        f["name"][len("data/"):]: f
        for f in bag["manifest"]["files"]
        if f["name"].startswith("data/")
    }

    root_mets_name = f"{b_number}.xml"

    # Find the METS file in the bag.  This should always be in the top level
    # of the data directory, named after the b number.
    # NOTE(review): the code using root_mets_name continues past this chunk.
def clone_bag(api_name, space, external_identifier):
    """Create a local clone of a storage-service bag under ``_bags/``.

    Writes a fetch.txt that points back at the original payload objects in
    S3, downloads the bag's tag files, and re-saves the result as a valid
    BagIt bag.  Exits if a clone of this bag already exists.
    """
    endpoints = {
        "prod": "https://api.wellcomecollection.org/storage/v1",
        "staging": "https://api-stage.wellcomecollection.org/storage/v1",
    }
    storage_client = get_storage_client(endpoints[api_name])
    stored_bag = storage_client.get_bag(
        space_id=space, source_id=external_identifier
    )

    os.makedirs("_bags", exist_ok=True)

    # Staging clones get a distinguishing prefix so they can't collide with
    # prod clones of the same bag.
    slug = f"{space}_{slugify(external_identifier)}"
    if api_name == "staging":
        slug = f"staging_{slug}"

    target_dir = os.path.join("_bags", slug)
    try:
        os.makedirs(target_dir)
    except FileExistsError:
        sys.exit("You already have a clone of this bag!")
    os.makedirs(os.path.join(target_dir, "data"))

    bucket = stored_bag["location"]["bucket"]
    path_prefix = stored_bag["location"]["path"]

    # Rather than copying every payload file, write a fetch.txt entry per
    # file pointing at the original object in S3.
    with open(os.path.join(target_dir, "fetch.txt"), "w") as fetch_file:
        payload_files = sorted(
            stored_bag["manifest"]["files"], key=lambda f: f["name"]
        )
        for entry in payload_files:
            path = entry["path"]
            size = entry["size"]
            name = entry["name"]
            fetch_file.write(
                f"s3://{bucket}/{path_prefix}/{path}\t{size}\t{name}\n"
            )

    # Download some of the manifest files from the original bag.  The
    # tagmanifest-* checksums are skipped -- they get regenerated on save.
    s3 = get_read_only_aws_resource("s3")
    for tag_file in stored_bag["tagManifest"]["files"]:
        path = tag_file["path"]
        name = tag_file["name"]
        if name.startswith("tagmanifest-"):
            continue
        s3.Bucket(bucket).download_file(
            Key=f"{path_prefix}/{path}",
            Filename=os.path.join(target_dir, name),
        )

    local_bag = bagit.Bag(target_dir)
    local_bag.save(manifests=True)

    print(f"✨ Created your new bag at {target_dir} ✨")
    print("")
    print("You can use the bagit-python tool to validate your new bag:")
    print("")
    print(f"$ bagit.py --validate {target_dir}")
    print("")
logger = get_logger(__name__)


if __name__ == "__main__":
    # The ingest ID is the only command-line argument.
    try:
        ingest_id = sys.argv[1]
    except IndexError:
        sys.exit(f"Usage: {__file__} <INGEST_ID>")

    ingest = lookup_ingest(ingest_id)

    # Replay the ingest against the same API it originally came from: the
    # @context URL on the ingest tells us whether it was staging or prod.
    api_url = (
        "https://api-stage.wellcomecollection.org/storage/v1"
        if "api-stage" in ingest["@context"]
        else "https://api.wellcomecollection.org/storage/v1"
    )

    client = get_storage_client(api_url)

    # Kick off a fresh ingest from the same S3 source location; the API
    # responds with the URL of the newly created ingest.
    location = client.create_s3_ingest(
        space_id=ingest["space"]["id"],
        s3_bucket=ingest["sourceLocation"]["bucket"],
        s3_key=ingest["sourceLocation"]["path"],
        external_identifier=ingest["bag"]["info"]["externalIdentifier"],
    )

    new_ingest_id = location.split("/")[-1]

    logger.info("Ingest created at URL %s", location)
    logger.info("Ingest has ID %s", new_ingest_id)
    logger.info(
        "To look up the ingest:\n\n\tpython ss_get_ingest.py %s",
        new_ingest_id,
    )