def main():
    # Test functions here.
    from pprint import pprint

    server = 'Test'
    asf.setServer(server)

    # The resource to scan
    the_resource = (4, 6288)

    # A place to put output of saved json objects (optional)
    output_folder = 'output/replace_extrefs'

    # Retrieve all archival objects under a given resource
    x = asf.getResponse('/repositories/' + str(the_resource[0]) +
                        '/resources/' + str(the_resource[1]) + '/ordered_records')
    y = json.loads(x)['uris']

    # Select only the ones that are items or files, and add to a list
    the_refs = [r['ref'] for r in y if r['level'] in ['item', 'file']]

    cnt = 0
    for a_ref in the_refs:
        ref_decomposed = a_ref.split('/')
        repo, asid = ref_decomposed[2], ref_decomposed[4]
        ref_json = asf.getArchivalObject(repo, asid)
        out_path = output_folder + '/' + str(repo) + '_' + str(asid) + '.json'
        data_old = ref_json

        # The regex substitution
        repl = re.subn(r'<extref\s+type=\\"simple\\"\s+href=',
                       r'<extref xlink:type=\"simple\" xlink:href=',
                       ref_json, flags=re.DOTALL)

        if repl[1] > 0:  # [1] is the count of replacements from subn
            # There is a change; save a copy of the existing object first.
            print('Saving data to ' + out_path + '....')
            with open(out_path, "w+") as f:
                f.write(data_old)

            data_new = repl[0]
            cnt += 1
            print('Posting ' + str(repo) + '_' + str(asid) + ' to ' + server)
            z = asf.postArchivalObject(repo, asid, data_new)
            print(z)

    print(' ')
    print('Total replacements: ' + str(cnt))
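
# A quick standalone sketch of the extref substitution used above. The sample
# string below is a made-up example of JSON-escaped EAD markup (real data comes
# from asf.getArchivalObject()); it only checks that the pattern matches the
# \" escaping that appears in API payloads.
def _extref_subn_sketch():
    import re
    sample_json = '{"note": "<extref type=\\"simple\\" href=\\"http://example.org/x\\">x</extref>"}'
    new_text, n = re.subn(r'<extref\s+type=\\"simple\\"\s+href=',
                          r'<extref xlink:type=\"simple\" xlink:href=',
                          sample_json, flags=re.DOTALL)
    assert n == 1
    assert 'xlink:href=' in new_text
    return new_text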
def get_agent_data(name, endpoint, pickle_path):
    print("Getting agents: " + name)
    # out_path = os.path.join(my_path, "output/agents_" + i["name"] + ".pickle")
    # out_path = os.path.join(out_folder, "agents_" + i["name"] + ".pickle")

    # Get a list of agent ids from API
    agents_list = json.loads(asf.getResponse(endpoint + "?all_ids=true"))

    agent_cnt_str = "Number of agents (" + \
        name + "): " + str(len(agents_list))
    print(agent_cnt_str)
    log_it(SCRIPT_NAME, agent_cnt_str)

    agent_data = []
    # Loop through agent ids and get full record from API.
    for cnt, agent in enumerate(agents_list):
        # print("COUNT: " + str(cnt))
        # print("Agent # " + str(agent))
        x = asf.getResponse(endpoint + "/" + str(agent))
        agent_data.append(json.loads(x))

    # Save data as pickle
    util.pickle_it(agent_data, pickle_path)

    return agent_data
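
# Example usage (illustrative only: the endpoint matches the ArchivesSpace
# corporate-entities API used elsewhere in these scripts, but the name and
# pickle path are assumed values, not defined in this file):
#
#   corporate_agents = get_agent_data(
#       "corporate", "/agents/corporate_entities",
#       "output/agents_corporate.pickle")
#   print(str(len(corporate_agents)) + " corporate agent records retrieved.")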
def main():

    # Set to True to use test sheet and test json folder location.
    debug = False

    asf.setServer("Prod")

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ""  # set later
    today_str = str((date.today()).strftime("%Y%m%d"))

    if debug:
        print("[Running script in debug mode...]")
        parent_folder = "/cul/cul0/ldpd/archivesspace/test/resources"  # test folder
        sheet_id = "1wFyLN_Ea7ExCZSMuksB8MTrS9DjsUkwsmaPBujL7x0U"  # test sheet
        the_repos = [4]  # to test
    else:
        parent_folder = "/cul/cul0/ldpd/archivesspace/resources"
        sheet_id = "1T3EpIZmnh3Gk-VAIGtvavTQUIpS7AluyKQ8-sJsS8vg"
        the_repos = [2, 3, 4, 5, 6]

    output_folder = parent_folder + "/" + today_str

    the_sheets = {
        "resources": dataSheet(sheet_id, "Resources!A:Z"),
        "cm": dataSheet(sheet_id, "Collection Management!A:Z"),
        "log": dataSheet(sheet_id, "log!A:Z"),
    }

    # Set number of chars to truncate the scope and bioghist notes.
    trunc_len = 400

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["bibid", "/id_0"],
        ["title", "/title"],
        ["published", "/publish"],
        ["create_time", "/create_time"],
        ["system_mtime", "/system_mtime"],
        ["created_by", "/created_by"],
        ["last_modified_by", "/last_modified_by"],
        ["ead_location", "/ead_location"],
        ["ext_number", "/extents/0/number"],
        ["ext_portion", "/extents/0/portion"],
        ["ext_type", "/extents/0/extent_type"],
        # ["integer_1", "/user_defined/integer_1"],
        # ["integer_2", "/user_defined/integer_2"],
        # ["integer_3", "/user_defined/integer_3"],
        ["local call no.", "/user_defined/string_1"],
        ["other ctrl no. 1", "/user_defined/string_2"],
        ["other ctrl no. 2", "/user_defined/string_3"],
        ["other ctrl no. 3", "/user_defined/string_4"],
        # ["enum_1", "/user_defined/enum_1"],
        # ["enum_2", "/user_defined/enum_2"],
        ["description status", "/user_defined/enum_3"],
        ["collecting area", "/user_defined/enum_4"],
        ["level", "level"]
        # (Scope and bioghist notes are added in separately below.)
    ]

    # Get the collection management records for use in report.
    the_cms = []
    fields = [
        "id",
        "parent_id",
        "title",
        "system_mtime",
        "processing_priority",
        "processing_status",
    ]

    print(" ")
    print("*** Retrieve Collection Management Data ***")
    print(" ")

    for r in the_repos:
        print("Getting collection management records for repo: " + str(r) + "...")
        cm = asf.getCollectionManagements(r, filter="resource", fields=fields)
        for c in cm:
            row = [c[f] for f in fields]
            the_cms.append(row)

    # A data set of collection management records to post to the sheet below.
    the_cms.insert(0, fields)

    print(" ")
    print("*** Retrieve Resource Data ***")
    print(" ")

    # Get the list of resources for each repo and add to the_ids
    the_ids = []
    for r in the_repos:
        print("Getting ids for repo: " + str(r) + "...")
        asids = json.loads(
            asf.getResponse("/repositories/" + str(r) + "/resources?all_ids=true"))
        print(str(len(asids)) + " records found in repo " + str(r) + ".")
        for i in asids:
            the_ids.append([r, i])

    # Construct the head row
    the_heads = [x[0] for x in the_fields]
    the_heads.insert(0, "asid")
    the_heads.insert(0, "repo")
    the_heads.append("scope note")
    the_heads.append("scopenote length")
    the_heads.append("bioghist note")
    the_heads.append("biognote length")

    the_output = [the_heads]

    # Fetch the resources from the ids
    print("Downloading resources...")

    if not os.path.exists(output_folder):
        print("Creating directory " + output_folder + "...")
        os.makedirs(output_folder)

    for repo, asid in the_ids:
        # print("Processing " + str(repo) + ":" + str(asid) + "...")
        the_row = [repo, asid]
        res_json = asf.getResource(repo, asid)
        res_dict = json.loads(res_json)

        out_path = output_folder + "/" + str(repo) + "_" + str(asid) + ".json"

        # Write the JSON to file.
        with open(out_path, "w+") as f:
            f.write(res_json)

        # Use dpath to extract values from dict and compose into rows.
        for af in the_fields:
            try:
                d = str(dpath.util.get(res_dict, af[1]))
            except:
                d = ""
            the_row.append(d)

        # Process scope and bioghist notes
        the_notes = dpath.util.values(res_dict, "notes/*", afilter=None)
        the_scope_notes = []
        the_biog_notes = []
        for a_note in the_notes:
            try:
                if a_note["type"] == "scopecontent":
                    the_scope_notes.append(a_note)
            except:
                pass
            try:
                if a_note["type"] == "bioghist":
                    the_biog_notes.append(a_note)
            except:
                pass

        if the_scope_notes:
            # If there are scope notes, grab all the text and concatenate.
            # Then get the total length in chars.
            scope_note_texts = [
                s["subnotes"][0]["content"] for s in the_scope_notes
            ]
            the_scope_text = " ".join(scope_note_texts)
            scope_note_len = len(the_scope_text)
            scope_note_short = truncate_str(the_scope_text, length=trunc_len)
        else:
            scope_note_short = ""
            scope_note_len = 0

        if the_biog_notes:
            # If there are bioghist notes, grab all the text and concatenate.
            # Then get the total length in chars.
            biog_note_texts = [
                s["subnotes"][0]["content"] for s in the_biog_notes
            ]
            the_biog_text = " ".join(biog_note_texts)
            biog_note_len = len(the_biog_text)
            biog_note_short = truncate_str(the_biog_text, length=trunc_len)
        else:
            biog_note_short = ""
            biog_note_len = 0

        the_row.append(scope_note_short)
        the_row.append(str(scope_note_len))
        the_row.append(biog_note_short)
        the_row.append(str(biog_note_len))

        the_output.append(the_row)

    # Zip up the JSON files for storage.
    zip_out = make_archive(today_str, "zip",
                           root_dir=parent_folder, base_dir=today_str)
    print(zip_out)

    # Zip is saved in working dir; move to correct location.
    print("Saving zip file " + str(today_str) + ".zip to " + parent_folder)

    # Test if file already exists.
    if os.path.exists(parent_folder + "/" + str(today_str) + ".zip"):
        print("File " + parent_folder + "/" + str(today_str) +
              ".zip exists already. Replacing with new zip file...")
        os.remove(parent_folder + "/" + str(today_str) + ".zip")

    move(zip_out, parent_folder)

    # Remove the json folder once zip is in place.
    rmtree(parent_folder + "/" + today_str)

    util.file_cleanup(parent_folder, 60)

    # Write output to Google sheet.
    print(" ")
    print("*** Writing Data to Report ***")
    print(" ")

    the_sheets["cm"].clear()
    the_sheets["cm"].appendData(the_cms)
    digester.post_digest(
        script_name,
        "Total collection management records: " + str(len(the_cms) - 1))

    the_sheets["resources"].clear()
    the_sheets["resources"].appendData(the_output)
    digester.post_digest(
        script_name,
        "Total number of resource records: " + str(len(the_output) - 1))

    ########################
    ###    FINISH UP     ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + my_name + ". Start: " + start_time +
               ". Finished: " + end_time + " (duration: " + my_duration + ").")

    the_sheets["log"].appendData([[the_log]])

    print(" ")
    print(the_log)
    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        the_sheets["resources"].url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)
def main():

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    sheet_id = '13OakaS0KHtxcaV9HGWDP9Zfnz9TVJR_9zGUnKrb90jk'  # test
    # sheet_id = '1tYOXSDFlkbX_revB_ULvhmCdvKkyzpipBTkYqYXcM38'
    # sheet_id = '1e43qKYvqGQFOMxA70U59yPKPs18y-k3ohRNdU-qrTH0'  # test
    # sheet_id = '1OhgJ4g-SWbmnms4b3ppe_0rBT7hz9jfQp6P8mADcatk'  # batch template doc

    container_sheet = dataSheet(sheet_id, 'containers!A:Z')
    marc_sheet = dataSheet(sheet_id, 'marc!A:Z')

    # Get a list of bibids from the Marc tab.
    # the_bibids = marc_sheet.getDataColumns()[0]
    the_bibids = marc_sheet.getDataColumns()[1]
    the_bibids.pop(0)
    the_bibids = list(set(the_bibids))
    print(the_bibids)

    #### TOP CONTAINERS ####

    the_heads = ['bibid', 'resource', 'uri', 'type', 'display_string', 'concat']
    the_rows = [the_heads]

    lookup_csv = os.path.join(my_path, 'id_lookup_prod.csv')

    for abib in the_bibids:
        print(abib)

        # Get repo and asid from bibid
        repo, asid = asf.lookupByBibID(abib, lookup_csv)
        print('Getting top containers for ' + str(repo) + ':' + str(asid))

        the_query = '/repositories/' + \
            str(repo) + '/resources/' + str(asid) + '/top_containers'

        # List of top containers
        the_refs = json.loads(asf.getResponse(the_query))
        print(the_refs)

        cnt = 0
        for r in the_refs:
            cnt += 1
            print(cnt)
            try:
                tc = json.loads(asf.getResponse(r['ref']))
                # print(tc)
                try:
                    bibid = tc['collection'][0]['identifier']
                except:
                    bibid = ''
                try:
                    resource = tc['collection'][0]['ref']
                except:
                    resource = ''
                try:
                    uri = tc['uri']
                except:
                    uri = ''
                try:
                    type = tc['type']
                except:
                    type = ''
                try:
                    display_string = tc['display_string']
                except:
                    display_string = ''
                try:
                    concat_str = tc['display_string'] + \
                        ' (' + uri.split('/')[4] + ')'
                except:
                    concat_str = 'x'

                a_row = [bibid, resource, uri, type, display_string, concat_str]
                # print(a_row)
                the_rows.append(a_row)
            except:
                print(r)

    # Write results to google sheet
    container_sheet.clear()
    z = container_sheet.appendData(the_rows)
    print(z)
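
# The chain of try/except blocks above pulls optional fields out of each top
# container record. A sketch of an equivalent, more compact approach using a
# small helper (illustrative only; the field paths follow the ArchivesSpace
# top container JSON used above):
def safe_get(record, *keys, default=''):
    # Walk nested dict/list keys, returning `default` if any step is missing.
    cur = record
    try:
        for k in keys:
            cur = cur[k]
        return cur
    except (KeyError, IndexError, TypeError):
        return default

# Example with a hypothetical record tc:
#   bibid = safe_get(tc, 'collection', 0, 'identifier')
#   uri = safe_get(tc, 'uri')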
{ "name": "corporate", "endpoint": "/agents/corporate_entities", }, { "name": "persons", "endpoint": "/agents/people", }, ] for i in the_info: print("Getting agents: " + i["name"]) out_path = os.path.join(my_path, "output/agents_" + i["name"] + ".pickle") # Get a list of agent ids from API agents_list = json.loads(asf.getResponse(i["endpoint"] + "?all_ids=true")) agent_cnt_str = "Number of agents (" + \ i['name'] + "): " + str(len(agents_list)) print(agent_cnt_str) digester.post_digest(script_name, agent_cnt_str) cnt = 0 agent_data = [] # Loop through agent ids and get full record from API. for agent in agents_list: cnt += 1 # print("COUNT: " + str(cnt)) # print("Agent # " + str(agent))