class ASpace():
    """High-level entry point to an ArchivesSpace instance.

    Wraps an :class:`ASnakeClient` and exposes API routes as attributes,
    plus a few curated relations (resources, agents, users).
    """

    # this happens when you call ASpace()
    def __init__(self, **config):
        # Connect to ASpace using .archivessnake.yml (keyword arguments
        # override values from the config file).
        self.client = ASnakeClient(**config)
        self.client.authorize()
        # Capture just the version number from a "(v2.8.1)"-style string.
        # FIX: the closing paren belonged outside the capture group; the old
        # pattern r'\(v?(.+\))' captured the trailing ')' as part of m[1].
        # NOTE(review): re.match anchors at the start of the response text;
        # if the endpoint prefixes other text before the parenthesized
        # version, this should be re.search — confirm against the live API.
        m = re.match(r'\(v?(.+)\)', self.client.get('version').text)
        if m:
            self.version = m[1]
        else:
            self.version = 'unknown version'

    def __getattr__(self, attr):
        '''returns the JSONModelRelation representing the route with the same
        name as the attribute requested.'''
        if not attr.startswith('_'):
            return JSONModelRelation("/{}".format(attr),
                                     params={"all_ids": True},
                                     client=self.client)
        # FIX: previously fell through and returned None for '_'-prefixed
        # names, which silently confuses protocols (copy, pickle, display
        # hooks) that probe dunder attributes. Raising AttributeError is the
        # documented __getattr__ contract.
        raise AttributeError(attr)

    @property
    def resources(self):
        '''return all resources from every repo.'''
        return ResourceRelation({}, self.client)

    @property
    def agents(self):
        '''returns an AgentRelation.'''
        return AgentRelation("/agents", {}, self.client)

    @property
    def users(self):
        '''returns a UserRelation.'''
        return UserRelation("/users", {}, self.client)

    def by_external_id(self, external_id, record_types=None):
        '''return any resources fetched from the 'by-external-id' route.

        Note: while the route will return differently depending on how many
        records are returned, this method deliberately flattens that out -
        it will _always_ return a generator, even if only one record is
        found.'''
        params = {"eid": external_id}
        if record_types:
            params['type[]'] = record_types

        res = self.client.get('by-external-id', params=params)
        if res.status_code == 404:
            # In a generator, returning ends iteration immediately (empty).
            return []
        elif res.status_code == 300:  # multiple returns, bare list of uris
            yield from (wrap_json_object({"ref": uri}, self.client)
                        for uri in IndexedSet(res.json()))
        elif res.status_code == 200:  # single obj, redirects to obj with 303->200
            yield wrap_json_object(res.json(), self.client)
        else:
            raise ASnakeBadReturnCode(
                "by-external-id call returned '{}'".format(res.status_code))

    def from_uri(self, uri):
        '''returns a JSONModelObject representing the URI passed in'''
        return wrap_json_object(self.client.get(uri).json(), self.client)
def main():
    """Normalize legacy extent_type spellings on every resource in repo 2.

    Fetches all resources, rewrites any extent_type that matches a known
    legacy spelling to its canonical value, and POSTs changed records back.
    Prints a summary of checked vs. updated records.
    """
    client = ASnakeClient(baseurl='XXXX', username='******', password='******')
    client.authorize()
    # canonical extent_type -> legacy spellings to migrate from
    changes = {
        'linear_feet': ['Linear Feet', 'linear ft.', 'Linear Foot'],
        'cubic_feet': ['Cubic Feet'],
        'gigabytes': ['Gigabytes']
    }
    res_records = client.get('repositories/2/resources',
                             params={'all_ids': True}).json()
    found_records = set()  # ids of resources actually updated
    for record in tqdm(res_records):
        rec_uri = 'repositories/2/resources/{0}'.format(record)
        res_record = client.get(rec_uri).json()
        updated_record = deepcopy(res_record)
        # FIX: was a bare `except: pass` around this whole body, which also
        # swallowed POST failures; now only malformed records (missing
        # 'extents'/'extent_type') are skipped, and that is logged.
        try:
            for ext_index, extent in enumerate(res_record['extents']):
                for key, value in changes.items():
                    if extent['extent_type'] in value:
                        updated_record['extents'][ext_index]['extent_type'] = key
                        break
        except KeyError:
            logger.info('Skipping record with missing extent data', rec=record)
            continue
        if res_record['extents'] != updated_record['extents']:
            response = client.post(rec_uri, json=updated_record)
            if response.status_code == 200:
                logger.info('Extent change successfully pushed',
                            rec=record, response=response)
                found_records.add(record)
            else:
                logger.info('Extent change failed',
                            rec=record, response=response)
    print('{0} resource records checked; {1} records updated.'.format(
        len(res_records), len(found_records)))
def test_authorize():
    """Authorizing should yield a 64-char hex token that is installed on the
    session headers and grants admin-level access."""
    # Relies on default config, see ASnakeConfig class
    client = ASnakeClient()
    session_token = client.authorize()

    assert isinstance(session_token, str)
    assert len(session_token) == 64
    hex_digits = set('0123456789abcdef')
    assert all(ch in hex_digits for ch in session_token)
    assert client.session.headers['X-ArchivesSpace-Session'] == session_token
    # Try to get admin user info, should only work if we're authed as admin
    assert client.get('users/1').status_code == 200
# Export digital object metadata (Dublin Core / METS) for specific digital
# objects via the ArchivesSpace API. The commented-out sections are alternate
# export formats that write the response to a file instead of printing it.
from asnake.client import ASnakeClient
from secrets import *
# NOTE(review): 'secrets' shadows the stdlib module — presumably a local
# credentials file providing as_api / as_username / as_password; confirm.

# as_repo = input("Enter ArchivesSpace repository #: ")
client = ASnakeClient(baseurl=as_api, username=as_username, password=as_password)
client.authorize()

# Dublin Core XML
# do_dc = client.get("/repositories/2/digital_objects/dublin_core/2679.xml")
# # print(do.content)
# with open("do_dublincore.xml", "wb") as file:
#     file.write(do_dc.content)
#     file.close()

# Dublin Core FMT
do_dc_fmt = client.get("/repositories/2/digital_objects/dublin_core/2679.fmt/metadata")
print(do_dc_fmt.content)
# with open("do_dc_fmt.json", "wb") as file:
#     file.write(do_dc_fmt.content)
#     file.close()

# METS XML
# mets_xml = client.get("/repositories/2/digital_objects/mets/2679.xml", params={"dmd": "PKG410P"})
# # print(mets_xml.content)
# with open("do_mets.xml", "wb") as file:
#     file.write(mets_xml.content)
#     file.close()
#
#
# METS FMT
mets_fmt = client.get('/repositories/2/digital_objects/mets/2697.fmt/metadata')
print(mets_fmt.content)
import asnake.logging as logging

# Append all ASnake debug output to a local log file.
logging.setup_logging(level='DEBUG', filename="remove_fake_wrapper.log", filemode="a")

aspace = ASpace(baseurl="[ASPACE API URL]", username="******", password="******")

#Log Into ASpace and set repo to RL
aspace_client = ASnakeClient(baseurl="[ASPACE API URL]", username="******", password="******")
aspace_client.authorize()

#Set target repo
repo = aspace_client.get("repositories/2").json()
print("Logged into: " + repo['name'])
rl_repo = aspace.repositories(2)

#input is output of SQL query above
input_csv = input("Path to CSV Input: ")
#output will be input CSV plus some extra columns for reporting on actions taken, errors, etc.
updated_resources_csv = input("Path to CSV Output: ")

#Test if more than one direct child of Resource Object
#Why? Don't want to assign all children to Resource if there are other sibling Components of the fake wrapper component
def only_one_direct_child_of_resource_test(resource_object):
    # Checks whether a resource has more than one top-level archival object.
    print("Checking for multiple top-level AOs...")
    # NOTE(review): 'row' is not defined in this scope and the passed-in
    # resource_object is immediately overwritten — this looks truncated or
    # buggy; confirm against the full script.
    resource_object = rl_repo.resources(row[0])
#/usr/bin/python3 #~/anaconda3/bin/python from asnake.client import ASnakeClient import asnake.logging as logging logging.setup_logging(filename="date_update.log", filemode="a") logger = logging.get_logger("date_updating") #Log Into ASpace and set repo to RL aspace_client = ASnakeClient(baseurl="[backendURL]", username="******", password="******") aspace_client.authorize() repo = aspace_client.get("repositories/2").json() print("Logged into: " + repo['name']) print("Getting list of resources...") resources_list = aspace_client.get( "repositories/2/resources?all_ids=true").json() resources_sorted = sorted(resources_list, reverse=True) for resource in resources_sorted: try: resource_json = aspace_client.get("repositories/2/resources/" + str(resource)).json() #print (resource_json) resource_uri = resource_json['uri'] print("updating: " + resource_uri) resource_update = aspace_client.post(resource_json['uri'], json=resource_json)
import io
import csv
from asnake.client import ASnakeClient
from asnake.aspace import ASpace

aspace = ASpace(baseurl="[ASPACE BACKEND URL]", username="******", password="******")

#Log Into ASpace and set repo to RL
aspace_client = ASnakeClient(baseurl="[ASPACE BACKEND URL]", username="******", password="******")
aspace_client.authorize()
repo = aspace_client.get("repositories/2").json()
print("Logged into: " + repo['name'])

# Folder where exported EAD files will be written
destination = 'C:/users/nh48/desktop/as_exports_temp/'

input_csv = input("Path to CSV Input: ")
#output will be input CSV plus some extra columns for reporting on actions taken, errors, etc.
updated_records_csv = input("Path to CSV Output: ")

#If Resource finding aid status = published, export the EAD for the resource, save to folder
def if_published_export_EAD(resource_uri):
    resource_json = aspace_client.get(resource_uri).json()
    published_status = resource_json['finding_aid_status']
    # The EAD export route uses /resource_descriptions/:id rather than
    # /resources/:id, hence the substitution on the record URI.
    id_uri_string = resource_json['uri'].replace("resources","resource_descriptions")
    #set EAD export options: number components and include DAOs
    # NOTE(review): function body appears truncated at this point in the chunk.
def main():
    """Tally resource extents for MS vs. UA collections in repository 2.

    Aggregates linear feet, cubic feet, and gigabytes (megabyte extents are
    converted to GB) and writes the totals to a dated JSON report file.
    """
    client = ASnakeClient(baseurl='XXXX', username='******', password='******')
    client.authorize()

    # unit family -> every extent_type spelling counted toward that family
    catalog = {
        'linear': ['linear_feet', 'Linear Feet', 'linear ft.', 'Linear Foot'],
        'cubic': ['cubic_feet', 'Cubic Feet'],
        'gb': ['gigabytes', 'Gigabytes']
    }
    res_records = client.get('repositories/2/resources',
                             params={'all_ids': True}).json()

    data_list = []
    print('Compiling resource records from API...')
    for record in tqdm(res_records):
        res_record = client.get(
            'repositories/2/resources/{0}'.format(record)).json()
        # FIX: was a bare `except: pass`; only records with missing keys or a
        # non-numeric megabyte amount are skipped now.
        try:
            for x in res_record['extents']:
                if x['extent_type'] == 'megabytes':
                    # Normalize megabytes to gigabytes so these aggregate
                    # with the 'gb' family below.
                    data_list.append({
                        'id': res_record['id_0'],
                        'amount': str(float(x['number']) / 1000),
                        'units': 'gigabytes'
                    })
                else:
                    data_list.append({
                        'id': res_record['id_0'],
                        'amount': x['number'],
                        'units': x['extent_type']
                    })
        except (KeyError, ValueError):
            continue

    # Running totals, keyed by collection prefix (MS/UA in id_0) and family.
    linear_ms = linear_ua = 0
    gb_ms = gb_ua = 0
    cubic_ms = cubic_ua = 0

    print('Analyzing extents in resource data...')
    for entry in data_list:
        # FIX: was a bare except; only float() raising ValueError on a
        # non-numeric amount is the expected failure here.
        try:
            amount = float(entry['amount'])
        except ValueError:
            exception = input(
                'Uh oh, looks like the analysis ran into a snag; most likely, '
                'a unit of extent for {0} ({1}) is not a pure number. Enter '
                "'stop' to kill the process so you can fix the record. "
                "Alternatively, you can enter 'continue' to skip this entry "
                'and keep the analysis going.'.format(
                    entry['id'], entry['amount']))
            if exception.strip().lower() == 'stop':
                quit()
            continue
        if entry['id'].startswith('MS') and entry['units'] in catalog['linear']:
            linear_ms += amount
        elif entry['id'].startswith('UA') and entry['units'] in catalog['linear']:
            linear_ua += amount
        elif entry['id'].startswith('MS') and entry['units'] in catalog['gb']:
            gb_ms += amount
        elif entry['id'].startswith('UA') and entry['units'] in catalog['gb']:
            gb_ua += amount
        elif entry['id'].startswith('MS') and entry['units'] in catalog['cubic']:
            cubic_ms += amount
        elif entry['id'].startswith('UA') and entry['units'] in catalog['cubic']:
            cubic_ua += amount

    report = {
        'MS Linear feet': round(linear_ms, 2),
        'UA Linear feet': round(linear_ua, 2),
        'Total linear feet': round((linear_ua + linear_ms), 2),
        'MS GB': round(gb_ms, 2),
        'UA GB': round(gb_ua, 2),
        'Total GB': round((gb_ms + gb_ua), 2),
        'MS Cubic feet': round(cubic_ms, 2),
        'UA Cubic feet': round(cubic_ua, 2),
        'Total Cubic feet': round((cubic_ua + cubic_ms), 2)
    }
    print('Generating report as JSON...')
    with open(('extent_calculator_' +
               (datetime.datetime.today().strftime('%Y-%m-%d')) + '.json'),
              'w') as f:
        json.dump(report, f)
#read existing exported collection data collectionData = [] collectionFile = open(os.path.join(staticData, "collections.csv"), "r", encoding='utf-8') for line in csv.reader(collectionFile, delimiter="|"): collectionData.append(line) collectionFile.close() #read existing exported subject data subjectData = [] subjectFile = open(os.path.join(staticData, "subjects.csv"), "r", encoding='utf-8') for line in csv.reader(subjectFile, delimiter="|"): subjectData.append(line) subjectFile.close print ("\tQuerying ArchivesSpace...") modifiedList = client.get("repositories/2/resources?all_ids=true&modified_since=" + str(startTime)).json() if len(modifiedList) > 0: print ("\tFound " + str(len(modifiedList)) + " new records!") print ("\tArchivesSpace URIs: " + str(modifiedList)) else: print ("\tFound no new records.") for colID in modifiedList: collection = client.get("repositories/2/resources/" + str(colID)).json() if collection["publish"] != True: print ("\t\tSkipping " + collection["title"] + " because it is unpublished") else: print ("\t\tExporting " + collection["title"] + " " + "(" + collection["id_0"] + ")") checkDACS = {} try: normalName = collection["finding_aid_title"]
primary_types = '/(resource|archival_object|accession|digital_object)/' results_file = 'term_audit_results.csv' # Repo list can either be a command line argument or prompted if len(sys.argv) == 2: repos = sys.argv[1] elif len(sys.argv) < 2: repos = input('Enter repository number (e.g., 1): ') else: sys.exit('Run script again with valid repo number(s)') if repos: repos = re.split(r'\D+', repos) repos = list(filter(None, repos)) else: repos = client.get('repositories').json() # Get list of search terms from CSV file with open('search_terms.csv', 'r', newline='') as term_file: reader = csv.DictReader(term_file) search_terms = list(reader) # Loop through ASpace repositories for repo in repos: headers = [] rows = [] if isinstance(repo, str): # For prompted or arg value repo lists repo_no = repo response = client.get(f'repositories/{repo_no}')
# Bulk-update sub_container indicator_2 values on archival objects from a
# spreadsheet: column 0 holds the archival object URI, column 5 the new value.
from openpyxl import load_workbook
from secrets import *
from asnake.aspace import ASpace
from asnake.client import ASnakeClient

# NOTE(review): 'secrets' shadows the stdlib module — presumably a local
# credentials file providing as_api / as_un / as_pw.
aspace = ASpace(baseurl=as_api, username=as_un, password=as_pw)
client = ASnakeClient(baseurl=as_api, username=as_un, password=as_pw)
client.authorize()

resource_id = input("Enter ASpace URI: ")
excel_filepath = input("Enter full filepath for spreadsheet: ")
wb = load_workbook(excel_filepath)
sheet = wb.active

# Skip the header row; values_only yields plain cell values per row.
for row in sheet.iter_rows(min_row=2, values_only=True):
    archival_object = client.get(row[0]).json()
    print("Converting: {} > {} ... ".format(
        archival_object["instances"][0]["sub_container"]["indicator_2"],
        row[5]), end='', flush=True)
    # Overwrite the child indicator on the first instance, then repost.
    archival_object["instances"][0]["sub_container"]["indicator_2"] = str(
        row[5])
    update_ao = client.post(row[0], json=archival_object)
    print("Done. Response: {}".format(update_ao.json()))
for key, value in instance["sub_container"].items(): if "indicator_" in key: if "unknown container" == value: print(archival_object) top_container = client.get( instance["sub_container"]["top_container"] ["ref"]).json() write_csv("a", archival_object["uri"], archival_object["title"], archival_object["dates"][0]["expression"], "Box {}".format(top_container["indicator"]), instance["sub_container"]["type_2"], value) cont_count += 1 return cont_count client = ASnakeClient(baseurl=as_api, username=as_un, password=as_pw) client.authorize() ua97_090_uri = "/repositories/5/resources/5071" write_csv("w", "URI", "Title", "Date", "Box Number", "Child Type", "Child Indicator") resource_info = client.get(ua97_090_uri).json() res_tree = client.get(resource_info["tree"]["ref"]).json() if "children" in res_tree.keys(): print(resource_info["title"]) unknowns = check_children(res_tree["children"], 0) print("Total unknown containers = {}".format(str(unknowns))) print("\n") print("-" * 100)
] } }) # it can take some time for the posted DOs to be indexed, so... showed_up_yet = None while not showed_up_yet: aoSearch = list(client.get_paged('search', params={"filter": AOQuery})) if any(aoSearch): showed_up_yet = True else: print("DOs not present in search yet, waiting a second for the indexer to catch up") sleep(1) linked_ao_uri = aoSearch[0]['uri'] # Get and store archival objects from above search aoRecord = client.get(linked_ao_uri).json() # Find existing instances and create new ones from new digital objects exising_instance = aoRecord['instances'][0] new_instance = {"instance_type": "digital_object", "digital_object": {"ref": uri}} # Merge old and new instances instances_new = [] instances_new.append(exising_instance) instances_new.append(new_instance) aoRecord['instances'] = instances_new # Post updated archival objects aoPost = client.post(linked_ao_uri, json=aoRecord).json() print(aoPost) # Save select information to new csv file f.writerow([title, digital_object_id, uri, linked_ao_uri])
# Fragment of a container-profile updater: find the top containers linked to
# a resource, then attach a user-chosen container profile to each.
# NOTE(review): 'client', 'endpoint', 'resource_id', and 'gen_dict_extract'
# are defined earlier in the full script — not visible in this chunk.
advanced_query = json.dumps({
    "filter_term": {
        "field": "collection_uri_u_sstr",
        "value": "/repositories/2/resources/" + resource_id,
        "jsonmodel_type":"field_query"}
})
results = list(client.get_paged(endpoint, params={'aq': advanced_query}))

# populate top_containers with the ids of each top_container in search results
top_containers = []
for value in gen_dict_extract('id', results):
    top_containers.append(value)

# GET each top_container listed in top_containers and add to records
records = []
for top_container in top_containers:
    output = client.get(top_container).json()
    records.append(output)

# have user enter container profile id
profile_id = input('Enter container profile ID (I am going to enter 9. You can select another value, as long that ID is in your instance of ArchivesSpace.): ')

# Add container profile to records and post
print ('The following records have been updated in ArchivesSpace:')
for record in records:
    record['container_profile'] = {'ref': '/container_profiles/' + profile_id}
    jsonLine = record
    uri = record['uri']
    post = client.post(uri, json=jsonLine).json()
    print(post)
from asnake.client import ASnakeClient from asnake.aspace import ASpace #BaseURL should point to backend (e.g. https://archivesspace.duke.edu/api or https://localhost:8089) aspace = ASpace(baseurl="[baseurl]", username="******", password="******") #Log Into ASpace and set repo to RL aspace_client = ASnakeClient(baseurl="[baseurl]", username="******", password="******") aspace_client.authorize() #set target repo by id repo = aspace_client.get("repositories/2").json() print("Logged into: " + repo['name']) # Prompt for input, a comma separated list of EADID values (e.g. johndoepapers, janedoepapers, johnandjanedoepapers) eadids = input("List of EADIDs: ") # Split comma separated list eadids_list = eadids.split(",") destination = 'C:/users/nh48/desktop/as_exports_temp/' #set EAD export options: number components and include DAOs export_options = '?include_daos=true&numbered_cs=true&include_unpublished=false' #Check if any unpublished nodes in the resource tree and if so, do not publish and export def has_unpublished_nodes():
newAccession["access_restrictions"] = True newAccession["restrictions_apply"] = True elif accessionKey == "use_restrictions_note": newAccession["use_restrictions"] = True newAccession["restrictions_apply"] = True newAccession["accession_date"] = str(datetime.today().strftime('%Y-%m-%d')) year = newAccession["accession_date"].split("-")[0] #login to ASpace client = ASnakeClient() client.authorize() logging.setup_logging(stream=sys.stdout, level='INFO') # Get related resouces call = "repositories/2/search?type[]=resource&page=1&aq={\"query\":{\"field\":\"identifier\", \"value\":\"" + str(args.ID) + "\", \"jsonmodel_type\":\"field_query\"}}" resourceResponse = client.get(call).json() if len(resourceResponse["results"]) < 1: raise Exception("ERROR: Could not find resource with ID: " + str(args.ID)) else: newAccession["related_resources"] = [{"ref": resourceResponse["results"][0]["uri"]}] # get accession id print ("Getting correct accession ID...") yearBegin = datetime.strptime(year + "-01-01 00:00", "%Y-%m-%d %H:%M") yearBeginPosix = str(time.mktime(yearBegin.timetuple())).split(".")[0] yearCall = "repositories/2/accessions?all_ids=true&modified_since=" + yearBeginPosix accessions = client.get(yearCall).json() idList = [] for aID in accessions: entry = client.get("repositories/2/accessions/" + str(aID)).json()
#!/usr/bin/env python from asnake.client import ASnakeClient import pandas as pd import datetime from tqdm import tqdm client = ASnakeClient(baseurl='XXX', username='******', password='******') client.authorize() accession_records = client.get('repositories/2/accessions', params={ 'all_ids': True }).json() unit_column = [] extent_column = [] collection_no_column = [] created_column = [] start = datetime.datetime.strptime('2017-07-01', '%Y-%m-%d') end = datetime.datetime.strptime('2018-07-31', '%Y-%m-%d') for record in tqdm(accession_records): accession_uri = client.get('repositories/2/accessions/' + str(record)).json() create_date = accession_uri['create_time'][0:10] date_parsed = datetime.datetime.strptime(create_date, '%Y-%m-%d') if start <= date_parsed <= end: coll_num = accession_uri['id_0'] extents = accession_uri['extents']
# have user enter resource id resource_id = input('Enter resource ID (in this case, you should enter 1): ') # search for top_containers linked to entered resource id endpoint = '/repositories/2/top_containers/search' advanced_query = json.dumps({ "filter_term": { "field": "collection_uri_u_sstr", "value": "/repositories/2/resources/" + resource_id, "jsonmodel_type": "field_query" } }) # Can't use get_paged because this endpoint returns raw Solr results = client.get(endpoint, params={ 'aq': advanced_query }).json()["response"]["docs"] # populate top_containers with the ids of each top_container in search results top_containers = [] for value in gen_dict_extract('id', results): top_containers.append(value) # GET each top_container listed in top_containers and add to records records = [] for top_container in top_containers: output = client.get(top_container).json() records.append(output) # have user enter container profile id profile_id = input(
parentList = [] itemData = arclight.json() for parent in itemData["response"][ "document"]["parent_ssm"][1:]: parentList.append( parent.split("_")[1]) parents = "|".join(parentList) else: #for new objects not yet indexed in ArcLight if tree is None: from asnake.client import ASnakeClient client = ASnakeClient() client.authorize() ref = client.get( "repositories/2/find_by_id/archival_objects?ref_id[]=" + refID).json() item = client.get( ref["archival_objects"][0] ["ref"]).json() resource = client.get( item["resource"] ["ref"]).json() tree = client.get(resource["tree"] ["ref"]).json() else: ref = client.get( "repositories/2/find_by_id/archival_objects?ref_id[]=" + refID).json() objURI = ref["archival_objects"][0][
from asnake.client import ASnakeClient from secrets import * as_username = input("ArchivesSpace username: "******"ArchivesSpace password: "******"repositories").json() print("Publishing Digital Objects...", end='', flush=True) for repo in repos: digital_object = {} dig_objs_per_repo = [] repo_digital_objects = client.get(repo["uri"] + "/digital_objects?all_ids=true").json() for dig_obj_id in repo_digital_objects: object_request = repo["uri"] + "/digital_objects/" + str(dig_obj_id) + "/publish" try: client.post(object_request) except Exception as e: print("Error found when requesting id: " + str(e) + "\n" + object_request) # digital_object[dig_obj_id] = client.get(repo["uri"] + "/digital_objects/" + str(dig_obj_id)).json() # dig_objs_per_repo.append(digital_object) # repo_dig_objects[repo['name']] = dig_objs_per_repo print("Done") # print(json_data)
# Replace placeholder "fauxcode" barcodes on top containers with real
# barcodes read from barcodes.csv (columns: 'uri', 'real').
import json, csv, runtime
from asnake.client import ASnakeClient

# print instructions
print(
    'This script replaces existing fauxcodes with real barcodes (linked in a separate csv file) in ArchivesSpace.'
)
input('Press Enter to connect to ArchivesSpace and post those barcodes...')

# This is where we connect to ArchivesSpace. See authenticate.py
client = ASnakeClient()
client.authorize()

# FIX: the CSV file handle was opened inline and never closed; use a context
# manager (newline='' per the csv module docs) so it is always released.
with open('barcodes.csv', newline='') as barcode_file:
    reader = csv.DictReader(barcode_file)

    # GET each top_container listed in the CSV, set its barcode, and repost.
    print('The following barcodes have been updated in ArchivesSpace:')
    for row in reader:
        uri = row['uri']
        container = client.get(uri).json()
        container['barcode'] = row['real']
        post = client.post(uri, json=container).json()
        print(post)
#Set time interval here (to get accessions created in last 24 hours) current_time_minus_day = current_time - timedelta(hours=24) #Convert time to ISO format for comparing to create dates in ASpace current_time_minus_day = current_time_minus_day.isoformat() print("Getting all Accessions created since: " + str(current_time_minus_day)) #ASNAKE #Log Into ASpace and set repo to RL aspace_client = ASnakeClient(baseurl="[ArchivesSpace backend API URL]", username="******", password="******") aspace_client.authorize() #Set Target Repository repo = aspace_client.get("repositories/2").json() print(repo['name']) accessions_list = aspace_client.get( "repositories/2/accessions?all_ids=true").json() #Sort accessions by ASpace ID (e.g. repositories/2/accessions/1234) accessions_sorted = sorted(accessions_list) #Just get the last 20 created accessions in ASpace (based on IDs, not create time) #assuming we won't create more than 20 accessions in time interval between cron jobs #get last 20 accessions in list (most recent accession will be last in list) last_20_accessions = accessions_sorted[-20:] print("Examining last 20 accessions created in ASpace...") for accession in last_20_accessions:
def main(ID, path=None, accession=None):
    # Package a collection directory into a SIP (BagIt bag). When an
    # accession number is given, look it up in ArchivesSpace, copy its
    # metadata into the bag-info, and post updated extents/dates back to the
    # accession record. Returns the SIP object.
    # NOTE(review): indentation reconstructed from a flattened chunk —
    # confirm nesting against the original script.
    if path == None:
        # No path given: expect a directory named after the ID under defaultPath.
        if not os.path.isdir(defaultPath):
            raise Exception("ERROR: default path " + defaultPath + " does not exist.")
        path = os.path.join(defaultPath, ID)
        if not os.path.isdir(path):
            raise Exception("ERROR: no " + ID + " directory exists for ingest in " + defaultPath)
    else:
        if not os.path.isdir(path):
            raise Exception("ERROR: " + str(path) + " is not a valid path.")
    print("Reading " + path)
    if accession == None:
        # No accession: just build and package the SIP.
        print("Building SIP...")
        SIP = SubmissionInformationPackage()
        SIP.create(ID)
        SIP.package(path)
        print("SIP " + SIP.bagID + " created.")
    else:
        print("Reading accession " + accession)
        import asnake.logging as logging
        from asnake.client import ASnakeClient
        client = ASnakeClient()
        client.authorize()
        logging.setup_logging(stream=sys.stdout, level='INFO')
        # Search ArchivesSpace for the accession by identifier.
        call = "repositories/2/search?page=1&aq={\"query\":{\"field\":\"identifier\", \"value\":\"" + accession + "\", \"jsonmodel_type\":\"field_query\"}}"
        accessionResponse = client.get(call).json()
        if len(accessionResponse["results"]) < 1:
            raise Exception("ERROR: Could not find accession with ID: " + accession)
        else:
            accessionObject = json.loads(
                accessionResponse["results"][0]["json"])
            # Multipart ids (id_0-id_1) must match the requested accession exactly.
            if "id_1" in accessionObject.keys():
                accessionID = accessionObject["id_0"] + "-" + accessionObject[
                    "id_1"]
                if accession != accessionID:
                    raise Exception(
                        "ERROR: Could not find exact accession with ID: " +
                        accession)
            if not "content_description" in accessionObject.keys():
                raise Exception("ERROR: no content description in " +
                                accessionID + " accession, " +
                                accessionObject["uri"])
            if len(accessionObject["related_resources"]) < 1:
                raise Exception("ERROR: no related resource for " +
                                accessionID + " accession, " +
                                accessionObject["uri"])
            else:
                resource = client.get(
                    accessionObject["related_resources"][0]["ref"]).json()
                creator = resource["title"]
                # The accession's linked resource must belong to this collection.
                if not ID.lower() == resource["id_0"].lower():
                    raise Exception("ERROR: accession " + accessionID +
                                    " does not link to collection ID " + ID +
                                    ". Instead linked to " + resource["id_0"])
                description = accessionObject["content_description"]
                print("Building SIP...")
                SIP = SubmissionInformationPackage()
                SIP.create(ID)
                SIP.package(path)
                print("SIP " + SIP.bagID + " created.")
                # Record accession metadata in the bag-info.
                SIP.bag.info["Accession-Identifier"] = accessionID
                SIP.bag.info["ArchivesSpace-URI"] = accessionObject["uri"]
                SIP.bag.info["Records-Creator"] = creator
                SIP.bag.info["Content-Description"] = description
                if "condition_description" in accessionObject.keys():
                    SIP.bag.info["Condition-Description"] = accessionObject[
                        "condition_description"]
                if "provenance" in accessionObject.keys():
                    SIP.bag.info["Provenance"] = accessionObject["provenance"]
                if "general_note" in accessionObject.keys():
                    SIP.bag.info["General-Note"] = accessionObject[
                        "general_note"]
                SIP.bag.info["Source-Location"] = path
                SIP.bag.info[
                    "Transfer-Method"] = "https://github.com/UAlbanyArchives/ingest-processing-workflow/ingest.py"
    print("Writing checksums...")
    SIP.bag.save(manifests=True)
    print("SIP Saved!")

    # List files in txt for processing
    print("(not) Listing files for processing...")
    #listFiles(ID)

    if accession == None:
        SIP.extentLog(
            "/media/SPE/DigitizationExtentTracker/DigitizationExtentTracker.xlsx"
        )
        print("Logged ingest to DigitizationExtentTracker.")
    else:
        print("Updating accession " + accessionID)
        # Append the bag ID to the accession's disposition note.
        if "disposition" in accessionObject.keys():
            accessionObject["disposition"] = accessionObject[
                "disposition"] + "\n" + str(SIP.bagID)
        else:
            accessionObject["disposition"] = str(SIP.bagID)
        totalSize = SIP.size()
        inclusiveDates = SIP.dates()
        # One extent for storage size and one for the digital file count.
        extent = {
            "jsonmodel_type": "extent",
            "portion": "whole",
            "number": str(totalSize[0]),
            "extent_type": str(totalSize[1])
        }
        extentFiles = {
            "jsonmodel_type": "extent",
            "portion": "whole",
            "number": str(totalSize[2]),
            "extent_type": "Digital Files"
        }
        # Single-day ranges get an expression instead of begin/end.
        if inclusiveDates[0] == inclusiveDates[1]:
            date = {
                "jsonmodel_type": "date",
                "date_type": "inclusive",
                "label": "creation",
                "begin": inclusiveDates[0],
                "expression": inclusiveDates[0]
            }
        else:
            date = {
                "jsonmodel_type": "date",
                "date_type": "inclusive",
                "label": "creation",
                "begin": inclusiveDates[0],
                "end": inclusiveDates[1]
            }
        if "extents" in accessionObject.keys():
            accessionObject["extents"].append(extent)
            accessionObject["extents"].append(extentFiles)
        else:
            accessionObject["extents"] = [extent, extentFiles]
        accessionObject["dates"].append(date)
        # Post the updated accession back to ArchivesSpace.
        updateAccession = client.post(accessionObject["uri"],
                                      json=accessionObject)
        if updateAccession.status_code == 200:
            print("\tSuccessfully updated accession " + accessionID)
        else:
            print(updateAccession.text)
            print("\tERROR " + str(updateAccession.status_code) +
                  "! Failed to update accession: " + accessionID)
    return SIP
class ASTemps():
    """Builds blank record templates from the ArchivesSpace schema endpoints.

    Fetches every JSON schema and dynamic enumeration from a running
    ArchivesSpace instance, then walks the schemas to produce template
    dictionaries (and optional CSV/JSON template files) whose keys are the
    writable properties of each record type.
    """

    def __init__(self):
        # Credentials/baseurl come from ASnakeClient's default config
        # (.archivessnake.yml) — nothing is passed explicitly here.
        self.client = ASnakeClient()
        self.auth = self.client.authorize()
        # Every schema definition, keyed by schema name.
        self.all_schemas = self.get_schemas()
        # All dynamic enumerations.
        # COULD ALSO DO /config/enumerations/names/:enum_name
        self.all_enums = self.get_dynamic_enums()
        # The list of schema names.
        self.schema_list = [key for key in self.all_schemas.keys()]
        # The distinct set of top-level keys used across all schema defs.
        self.type_list = list(
            set([
                k for value in self.all_schemas.values()
                for k, v in value.items()
            ]))
        # Matches schema type strings like "JSONModel(:resource) uri".
        # (raw string avoids relying on Python passing \( and \s through.)
        self.jsonmodel_pattern = re.compile(
            r'(JSONModel)(\(:.*?\)\s)(uri|object|uri_or_object)')

    def get_schemas(self):
        """Return all schema definitions from /schemas."""
        schemas = self.client.get('/schemas').json()
        return schemas

    def get_schema(self, schema):
        """Return a single schema definition by name."""
        schema = self.client.get('/schemas/' + schema).json()
        return schema

    def get_dynamic_enums(self):
        """Return all dynamic enumerations from /config/enumerations."""
        enums = self.client.get('/config/enumerations').json()
        return enums

    def parse_jsonmodel(self, obj_value):
        """Resolve a "JSONModel(:name) uri|object" type string.

        Returns a {'ref': ...} stub for uri types, or a nested template (via
        parse_schema) for object types.

        NOTE(review): if obj_value matches the pattern but no branch below
        assigns parsed_json (e.g. a digital_object 'object' type), the final
        return raises UnboundLocalError — callers swallow this via their
        broad except handlers. Confirm before relying on the return value.
        """
        logging.debug('starting jsonmodel')
        if self.jsonmodel_pattern.match(obj_value):
            logging.debug('match with ' + str(obj_value))
            # Extract the schema name between "(:" and ")".
            stripped_string = obj_value[obj_value.find("(") +
                                        1:obj_value.find(")")][1:]
            if stripped_string != 'repository':
                logging.debug('Getting schema for: ' + stripped_string)
                jsonmodel_schema = self.all_schemas[stripped_string]
                if 'uri' in obj_value:
                    # uri types become a ref stub pointing at the schema's URI.
                    logging.debug('uri in obj_value')
                    parsed_json = {'ref': jsonmodel_schema['uri']}
                    logging.debug(str(parsed_json))
                # NOTE: 'object' is also a substring of 'digital_object',
                # hence the extra exclusion below.
                if 'object' in obj_value:
                    if 'digital_object' not in obj_value:
                        logging.debug('object in obj_value')
                        # Workaround: note_outline_level recurses infinitely,
                        # so it is short-circuited to None.
                        if stripped_string == 'note_outline_level':
                            parsed_json = None
                        else:
                            # KNOWN ISSUE: mutual recursion with parse_schema
                            # can be unbounded for some schemas.
                            logging.debug("obj_value " + str(obj_value))
                            logging.debug('running parse_schema on ' +
                                          str(obj_value))
                            parsed_json = self.parse_schema(
                                stripped_string, jsonmodel_schema)
            # Hard-coded: the repository ref is stable and saves a lookup.
            if stripped_string == 'repository':
                parsed_json = {'ref': '/repositories/:repo_id'}
        return parsed_json

    # Still more to do with the other ref properties.
    def parse_refs(self, schema_name, obj_name, obj_value):
        """Resolve the 'ref' property of a subrecord definition.

        Returns a list of parsed refs when the ref type is a list, a single
        parsed ref otherwise, or None implicitly when nothing matches.
        """
        logging.debug('starting parse_refs on ' + str(schema_name) + ' ' +
                      str(obj_name))
        if 'properties' in obj_value:
            logging.debug('properties in ' + str(obj_value))
            if 'ref' in obj_value['properties']:
                logging.debug('ref in properties')
                if type(obj_value['properties']['ref']['type']) is list:
                    logging.debug('Type of ref is list')
                    logging.debug("obj_value['properties']['ref']['type']: " +
                                  str(obj_value['properties']['ref']['type']))
                    ref_list = []
                    for ref in obj_value['properties']['ref']['type']:
                        logging.debug('Looping through ref list')
                        logging.debug(obj_value['properties']['ref']['type'])
                        logging.debug(ref['type'])
                        # FIX THIS
                        parsed_ref = self.parse_jsonmodel(ref['type'])
                        logging.debug('parsed ref ' + str(parsed_ref))
                        ref_list.append(parsed_ref)
                    logging.debug('ref_list: ' + str(ref_list))
                    return ref_list
                else:
                    logging.debug('Type of ref is not list')
                    if self.jsonmodel_pattern.match(
                            obj_value['properties']['ref']['type']):
                        logging.debug(
                            'RE match ' +
                            str(obj_value['properties']['ref']['type']))
                        logging.debug('calling parse_jsonmodel')
                        parsed_ref = self.parse_jsonmodel(
                            obj_value['properties']['ref']['type'])
                        return parsed_ref
        else:
            # No 'properties' key: the ref type string sits directly on
            # obj_value['ref']['type'].
            logging.debug('properties not in ' + str(obj_name) +
                          'value dictionary')
            logging.debug(str(obj_value['ref']['type']))
            if self.jsonmodel_pattern.match(obj_value['ref']['type']):
                logging.debug(
                    str(obj_value['ref']['type']) +
                    ' matches jsonmodel pattern')
                logging.debug('Calling parse_jsonmodel on ' +
                              str(obj_value['ref']['type']))
                parsed_ref = self.parse_jsonmodel(obj_value['ref']['type'])
                return parsed_ref

    def parse_enums(self, enum_name):
        """Return the list of values for the dynamic enumeration enum_name.

        Returns an empty list when no enumeration with that name exists.
        """
        enum_list = []
        for enum in self.all_enums:
            if enum['name'] == enum_name:
                for ev in enum['enumeration_values']:
                    enum_list.append(ev['value'])
        return enum_list

    def parse_schema(self, schema_name, schema_def):
        """Build a template dict for one schema.

        Walks schema_def['properties'] and maps each writable property to a
        placeholder: None for scalars, enum value lists for enums, nested
        templates/refs for JSONModel types.

        NOTE(review): the try wraps the whole loop and the `finally` returns,
        so any exception yields a partial template rather than propagating.
        The `is 'object'` identity comparisons are kept deliberately — the
        original author documents that switching them to `==` changes output
        (and in one case causes infinite recursion).
        """
        try:
            logging.debug("Working on schema: " + str(schema_name))
            template_dict = {}
            # Fixes infinite recursion for now: schemas/properties that
            # recurse into each other are simply skipped.
            exclusions = [
                'collection_management', 'rights_statement',
                'rights_statement_act', 'note_rights_statement',
                'note_rights_statement_act', 'children', 'deaccessions',
                '_inherited', 'rights_statements', 'external_id'
            ]
            for prop_name, prop_value in schema_def['properties'].items():
                logging.debug("Working on prop: " + str(prop_name))
                if schema_name in exclusions:
                    print(schema_name + ' in exclusion list')
                    continue
                elif prop_name in exclusions:
                    print(str(prop_name) + ' in exclusion list')
                    continue
                # If there is more than one type it will be stored in a list.
                elif type(prop_value['type']) is list:
                    # INTEGER/STRING: this is always (and only?) the lock
                    # version; nothing is emitted for it, but the branch is
                    # kept in case the schema changes.
                    if 'query' in prop_value['type'][0]:
                        continue
                    if type(prop_value['type'][0]) is dict:
                        continue
                # If there is only one type it won't be in a list.
                else:
                    # JSONMODEL TYPES: either an object or a URI; refers to
                    # another schema or a reference to another object
                    # (date subrecords, location URIs, ...).
                    if self.jsonmodel_pattern.match(prop_value['type']):
                        logging.debug('Regex match, ' + str(prop_value['type']))
                        # Read-only fields are mostly left out of the
                        # template, except ref subtypes.
                        if 'readonly' in prop_value:
                            logging.debug('Property value is readonly')
                            if 'subtype' in prop_value:
                                logging.debug('Subtype in property value')
                                if prop_value['subtype'] == 'ref':
                                    # FIX: was prop_value['tyoe'] — a KeyError
                                    # that silently aborted this schema via
                                    # the except KeyError handler below.
                                    logging.debug(
                                        'Subtype of ' + str(prop_name) +
                                        'is ref, calling parse_jsonmodel on ' +
                                        str(prop_value['type']))
                                    template_dict[
                                        prop_name] = self.parse_jsonmodel(
                                            prop_value['type'])
                        else:
                            logging.debug(
                                'readonly not in property value dict, calling parse_jsonmodel on '
                                + str(prop_value['type']))
                            template_dict[prop_name] = self.parse_jsonmodel(
                                prop_value['type'])
                    elif prop_value['type'] == 'array':
                        logging.debug('Prop value type is array')
                        # This will always be the case I think? Check.
                        if 'items' in prop_value:
                            # If there is more than one item type...
                            if type(prop_value['items']['type']) is list:
                                logging.debug('Type of array items is list')
                                template_dict[prop_name] = []
                                for prop_type in prop_value['items']['type']:
                                    if self.jsonmodel_pattern.match(
                                            prop_type['type']):
                                        parsed_json = self.parse_jsonmodel(
                                            prop_type['type'])
                                        template_dict[prop_name].append(
                                            parsed_json)
                                    if prop_type['type'] is 'object':
                                        logging.debug(schema_name, prop_name,
                                                      prop_value)
                            # If there is only one item type...
                            else:
                                logging.debug('Type of array items is object')
                                if prop_value['items']['type'] is 'object':
                                    if 'subtype' in prop_value['items']:
                                        # These usually have properties.
                                        if 'properties' in prop_value['items']:
                                            template_dict[
                                                prop_name] = self.parse_refs(
                                                    schema_name, prop_name,
                                                    prop_value)
                                    else:
                                        if 'properties' in prop_value['items']:
                                            logging.debug(
                                                schema_name, schema_name,
                                                prop_name, prop_value)
                                if prop_value['items']['type'] == 'string':
                                    if 'enum' in prop_value['items']:
                                        template_dict[prop_name] = prop_value[
                                            'items']['enum']
                                # Arrays of JSONModel objects become a
                                # one-element list holding the nested template.
                                if self.jsonmodel_pattern.match(
                                        prop_value['items']['type']):
                                    logging.debug(prop_name)
                                    logging.debug(
                                        str(prop_value['items']['type']))
                                    parsed_json = self.parse_jsonmodel(
                                        prop_value['items']['type'])
                                    template_dict[prop_name] = [parsed_json]
                    elif prop_value['type'] == 'object':
                        logging.debug('Prop value type is object')
                        if 'properties' in prop_value:
                            if 'subtype' in prop_value:
                                logging.debug(
                                    'subtype in prop value, calling parse_refs on '
                                    + str(schema_name) + ' ' + str(prop_name))
                                # These are all refs I think.
                                template_dict[prop_name] = self.parse_refs(
                                    schema_name, prop_name, prop_value)
                            else:
                                logging.debug('subtype not in prop_value: ')
                                logging.debug(schema_name, prop_name,
                                              prop_value)
                    elif prop_value['type'] == 'string':
                        logging.debug('Prop value is string')
                        # Enums are always strings.
                        if 'readonly' not in prop_value:
                            logging.debug(
                                'readonly not in prop value dictionary')
                            # NOTE(review): when 'enum' is present but
                            # 'dynamic_enum' is not, the else below overwrites
                            # the enum list with None — confirm intended.
                            if 'enum' in prop_value:
                                template_dict[prop_name] = prop_value['enum']
                            if 'dynamic_enum' in prop_value:
                                template_dict[prop_name] = self.parse_enums(
                                    prop_value['dynamic_enum'])
                            else:
                                template_dict[prop_name] = None
                    elif prop_value['type'] in [
                            'integer', 'boolean', 'date', 'date-time', 'number'
                    ]:
                        logging.debug(
                            'Prop value is type int, bool, date, date-time, number'
                        )
                        if 'readonly' not in prop_value:
                            logging.debug(
                                'readonly not in prop value dictionary')
                            template_dict[prop_name] = None
                    else:
                        logging.debug('Value not of a recognized type')
        except KeyError:
            logging.debug('KeyError: ' + schema_name + ' ' + prop_name)
        except Exception as exc:
            logging.debug('Error: ' + schema_name + ' ' + prop_name)
            logging.debug(traceback.format_exc())
        finally:
            # Always stamp the record type, even on a partial template.
            template_dict['jsonmodel_type'] = schema_name
            return template_dict

    def parse_schemas(self, schemas):
        """Build templates for every schema in a {name: definition} dict."""
        template_dict = {}
        for schema_name, schema_def in schemas.items():
            temp = self.parse_schema(schema_name, schema_def)
            template_dict[schema_name] = temp
        return template_dict

    def create_csv_template(self, jsontemplatedict):
        '''
        Goal is to create the JSON templates, and then convert those to CSV
        file that can be used to create either full finding aids/top level
        records, or to update subrecords in bulk.

        Writes a one-row header CSV named after the template's
        jsonmodel_type and returns the list of column names.
        '''
        # NOTE: opened in append mode, so repeated runs add header rows.
        fileob = open(jsontemplatedict['jsonmodel_type'] + '.csv',
                      'a',
                      encoding='utf-8',
                      newline='')
        csvout = csv.writer(fileob)
        subfield_list = []
        for key, value in jsontemplatedict.items():
            if type(value) is list:
                # List of dicts: flatten each subrecord key into its own
                # type_key_subkey column.
                if type(value[0]) is dict:
                    for item in value:
                        for k in item.keys():
                            subfield_list.append(
                                jsontemplatedict['jsonmodel_type'] + '_' +
                                key + '_' + k)
                # Otherwise it's a flat enum-value list: one column, deduped.
                if type(value[0]) is not dict:
                    check = jsontemplatedict['jsonmodel_type'] + '_' + key
                    if check not in subfield_list:
                        subfield_list.append(
                            jsontemplatedict['jsonmodel_type'] + '_' + key)
            else:
                subfield_list.append(jsontemplatedict['jsonmodel_type'] +
                                     '_' + key)
        csvout.writerow(subfield_list)
        fileob.close()
        return subfield_list

    def create_csv_templates(self, jsontemplates):
        """Wrapper loop to create a CSV template for every JSON template."""
        for template_key, template_value in jsontemplates.items():
            self.create_csv_template(template_value)

    def download_templates(self, jsontemplates):
        """Write each template to <name>.json in the working directory."""
        for template_key, template_value in jsontemplates.items():
            # FIX: use a context manager — the original never closed outfile.
            with open(str(template_key) + '.json', 'w',
                      encoding='utf-8') as outfile:
                json.dump(template_value, outfile, sort_keys=True, indent=4)
"/repositories/4/resources/1001", "/repositories/4/resources/4048", "/repositories/2/resources/633", "/repositories/2/resources/723", "/repositories/2/resources/748", "/repositories/2/resources/414"] # "/repositories/5/resources/5071" - UA collection - Steve to check with Kat for resource_id in resource_ids: unknown_count = 0 uri_breakup = resource_id.split("/") res_id = uri_breakup[4] repo_id = uri_breakup[2] try: rl_repo = aspace.repositories(repo_id) resource_record = rl_repo.resources(res_id).tree resource_tree = resource_record.walk print(rl_repo.resources(res_id).json()["title"]) for node in resource_tree: ao_json = client.get(node.uri).json() for instance in ao_json["instances"]: if "sub_container" in instance.keys(): indicators = [] types = [] for key, value in instance["sub_container"].items(): if "indicator_" in key: if "unknown container" == value: child_type = "type_" + str(key[-1]) indicators.append(key) types.append(child_type) unknown_count += 1 for indicator in indicators: try: del instance["sub_container"][indicator] except Exception as e:
#read existing exported collection data collectionData = [] #collectionFile = open(os.path.join(staticData, "collections.csv"), "r", encoding='utf-8') #for line in csv.reader(collectionFile, delimiter="|"): # collectionData.append(line) #collectionFile.close() #read existing exported subject data subjectData = [] #subjectFile = open(os.path.join(staticData, "subjects.csv"), "r", encoding='utf-8') #for line in csv.reader(subjectFile, delimiter="|"): # subjectData.append(line) #subjectFile.close print ("\tQuerying ArchivesSpace...") modifiedList = client.get("repositories/3/resources?all_ids=true&modified_since=" + str(startTime)).json() if len(modifiedList) > 0: print ("\tFound " + str(len(modifiedList)) + " new records!") print ("\tArchivesSpace URIs: " + str(modifiedList)) else: print ("\tFound no new records.") for colID in modifiedList: collection = client.get("repositories/3/resources/" + str(colID)).json() if collection["publish"] != True: print ("\t\tSkipping " + collection["title"] + " because it is unpublished") else: print ("\t\tExporting " + collection["title"] + " " + "(" + collection["id_0"] + ")") try: normalName = collection["finding_aid_title"] except:
class ArchivesSpaceClient(object):
    """Client to get and receive data from ArchivesSpace."""

    def __init__(self, baseurl, username, password, repo_id):
        # Authorize immediately; a falsy return from authorize() is treated
        # as a credential failure.
        self.client = ASnakeClient(baseurl=baseurl,
                                   username=username,
                                   password=password)
        self.repo_id = repo_id
        if not self.client.authorize():
            raise ArchivesSpaceClientError(
                "Couldn't authenticate user credentials for ArchivesSpace")
        # Maps a human-readable type name to [jsonmodel_type, API endpoint].
        self.TYPE_LIST = {
            "family": ["agent_family", "agents/families"],
            "organization":
            ["agent_corporate_entity", "agents/corporate_entities"],
            "person": ["agent_person", "agents/people"],
            "component": [
                "archival_object",
                "repositories/{repo_id}/archival_objects".format(
                    repo_id=self.repo_id)
            ],
            "accession": [
                "accession",
                "repositories/{repo_id}/accessions".format(
                    repo_id=self.repo_id)
            ],
            "digital object": [
                "digital_objects",
                "repositories/{repo_id}/digital_objects".format(
                    repo_id=self.repo_id)
            ]
        }

    def send_request(self, method, url, data=None, **kwargs):
        """Base method for sending requests to ArchivesSpace.

        Returns the parsed JSON response on HTTP 200; otherwise either
        retries an accession create with a bumped id_1 (see below) or raises
        ArchivesSpaceClientError.
        """
        r = getattr(self.client, method)(url, data=json.dumps(data), **kwargs)
        if r.status_code == 200:
            return r.json()
        else:
            # NOTE(review): assumes the error payload is a dict with an
            # "id_0" key when the failure is a duplicate identifier —
            # confirm against the ArchivesSpace error format.
            if r.json()["error"].get("id_0"):
                """Account for indexing delays by bumping up to the next accession number."""
                # NOTE(review): this recurses via self.create with no depth
                # limit if the server keeps rejecting — confirm acceptable.
                id_1 = int(data["id_1"])
                id_1 += 1
                data["id_1"] = str(id_1).zfill(3)
                return self.create(data, "accession")
            raise ArchivesSpaceClientError(
                "Error sending {} request to {}: {}".format(
                    method, url, r.json()["error"]))

    def retrieve(self, url, **kwargs):
        # GET an object by URI and return its parsed JSON.
        return self.send_request("get", url, **kwargs)

    def create(self, data, type, **kwargs):
        # POST new data to the endpoint registered for `type` in TYPE_LIST.
        return self.send_request("post", self.TYPE_LIST[type][1], data,
                                 **kwargs)

    def update(self, uri, data, **kwargs):
        # POST updated data back to an existing object's URI.
        return self.send_request("post", uri, data, **kwargs)

    def get_or_create(self, type, field, value, last_updated, consumer_data):
        """
        Attempts to find and return an object in ArchivesSpace.
        If the object is not found, creates and returns a new object.
        """
        model_type = self.TYPE_LIST[type][0]
        endpoint = self.TYPE_LIST[type][1]
        query = json.dumps({
            "query": {
                "field": field,
                "value": value,
                "jsonmodel_type": "field_query"
            }
        })
        try:
            r = self.client.get("repositories/{}/search".format(self.repo_id),
                                params={
                                    "page": 1,
                                    "type[]": model_type,
                                    "aq": query
                                }).json()
            if len(r["results"]) == 0:
                # Search index may lag: fall back to scanning every object
                # modified since two minutes before last_updated.
                r = self.client.get(endpoint,
                                    params={
                                        "all_ids": True,
                                        "modified_since": last_updated - 120
                                    }).json()
                for ref in r:
                    r = self.client.get("{}/{}".format(endpoint, ref)).json()
                    if r[field] == str(value):
                        return r["uri"]
                # Nothing matched anywhere: create it from consumer_data.
                return self.create(consumer_data, type).get("uri")
            return r["results"][0]["uri"]
        except Exception as e:
            raise ArchivesSpaceClientError(
                "Error finding or creating object in ArchivesSpace: {}".format(
                    e))

    def next_accession_number(self):
        """
        Finds the next available accession number by searching for accession
        numbers with the current year, and then incrementing.

        Assumes that accession numbers are in the format YYYY NNN, where
        YYYY is the current year and NNN is a zero-padded integer.
        """
        current_year = str(date.today().year)
        try:
            query = json.dumps({
                "query": {
                    "field": "four_part_id",
                    "value": current_year,
                    "jsonmodel_type": "field_query"
                }
            })
            r = self.client.get("repositories/{}/search".format(self.repo_id),
                                params={
                                    "page": 1,
                                    "type[]": "accession",
                                    "sort": "identifier desc",
                                    "aq": query
                                }).json()
            number = "1"
            # NOTE(review): r.get("total_hits") is None when the key is
            # absent, and None >= 1 raises TypeError (caught below and
            # re-raised as ArchivesSpaceClientError) — confirm intended.
            if r.get("total_hits") >= 1:
                if r["results"][0]["identifier"].split("-")[0] == current_year:
                    id_1 = int(r["results"][0]["identifier"].split("-")[1])
                    id_1 += 1
                    number = str(id_1).zfill(3)
            # NOTE(review): identifiers are split on "-" above but joined
            # with ":" here — verify which separator the instance uses.
            return ":".join([current_year, number.zfill(3)])
        except Exception as e:
            raise ArchivesSpaceClientError(
                "Error retrieving next accession number from ArchivesSpace: {}"
                .format(e))
"literal": True }, { "jsonmodel_type": "field_query", "field": "source", "value": "viaf", "literal": True }] } }) ASoutput = list(client.get_paged("/search", params={"filter": query})) print('Found ' + str(len(ASoutput)) + ' agents.') # grab uri out of agent for person in ASoutput: uri = person['uri'] personRecord = client.get(uri).json() lockVersion = str(personRecord['lock_version']) primary_name = personRecord['names'][0]['primary_name'] try: secondary_name = personRecord['names'][0]['rest_of_name'] except: secondary_name = '' try: dates = personRecord['names'][0]['dates'] except: dates = '' searchName = primary_name + ', ' + secondary_name + ', ' + dates nameEdited = quote(searchName.strip()) url = viafURL + nameEdited + '%22+and+local.sources+%3D+%22lc%22&sortKeys=holdingscount&maximumRecords=1&httpAccept=application/rdf+json' # first need to treat the response as text since we get an xml resopnse (with json embedded inside) response = requests.get(url).text
# add the handlers to the logger (fh/ch are created earlier in the script)
logger.addHandler(fh)
logger.addHandler(ch)

# Load API credentials from settings.ini and stash the parsed config on args.
config = configparser.ConfigParser()
config.read('settings.ini')
args.config = config

try:
    client = ASnakeClient(
        baseurl=config['aspace_credentials']['api_host'],
        username=config['aspace_credentials']['username'],
        password=config['aspace_credentials']['password'])
except KeyError as e:
    # A missing file or missing section/key surfaces as KeyError on lookup.
    logger.error('settings.ini does not exist or is invalid')
    raise e

# Simple sanity check to make sure client is setup
try:
    resp = client.get('/')
    if not resp.ok:
        resp.raise_for_status()
except Exception as e:
    # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt and
    # SystemExit and discarded the original error. Catch Exception and chain
    # the cause so the underlying HTTP failure stays visible.
    logger.error('Unable to contact ArchivesSpace instance at %s' %
                 config['aspace_credentials']['api_host'])
    raise APIContactError(
        'Unable to contact ArchivesSpace instance at %s' %
        config['aspace_credentials']['api_host']) from e

main_menu()
    # Tail of findKey(d, key) — the generator's `def` line precedes this
    # chunk. Recurse into list-valued entries so nested dicts (tree children)
    # are searched too. (Indentation assumed to be function-body level —
    # TODO confirm against the full file.)
    for k in d:
        if isinstance(d[k], list):
            for i in d[k]:
                for j in findKey(i, key):
                    yield j


# Script: fetch every archival object in one resource's tree and dump the
# full JSON records to archivalObjects.json.
repository = input('Enter Repository ID: ')
resourceID = input('Enter resource ID: ')

# Presumably configured via .archivessnake.yml (ASnakeClient defaults) —
# no credentials are passed here.
client = ASnakeClient()
client.authorize()

# The /tree endpoint returns the resource's full component hierarchy.
endpoint = '/repositories/' + repository + '/resources/' + resourceID + '/tree'
output = client.get(endpoint).json()

# Keep only archival-object URIs from the tree's 'record_uri' values
# (this filters out the resource's own URI, which also appears in the tree).
archivalObjects = []
for value in findKey(output, 'record_uri'):
    if 'archival_objects' in value:
        archivalObjects.append(value)

# Fetch the complete JSON for each archival object.
records = []
for archivalObject in archivalObjects:
    output = client.get(archivalObject).json()
    records.append(output)

f = open('archivalObjects.json', 'w')
json.dump(records, f)
f.close()