def main(): # Main code goes here. asf.setServer("Prod") output_folder = "output/resource_remove_links" the_lookup_csv = "id_lookup_prod.csv" bibid_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-161-remove-links/acfa-161-remove-links.txt" # Read a list of bibids (csv) the_bibids = [] with open(bibid_file) as ids: for row in csv.reader(ids): the_bibids.append(row[0]) for b in the_bibids: try: repo, asid = asf.lookupByBibID(b, the_lookup_csv) print("Processing " + str(b) + "...") out_path_old = (output_folder + "/" + str(repo) + "_" + str(asid) + "_old.json") out_path_new = (output_folder + "/" + str(repo) + "_" + str(asid) + "_new.json") x = asf.getResource(repo, asid) # Save copy of existing object print("Saving data to " + out_path_old + "....") with open(out_path_old, "w+") as f: f.write(x) x_dict = json.loads(x) print(x_dict["ead_location"]) if "ead_location" in x_dict: del x_dict["ead_location"] else: print("No URL to delete!") y = json.dumps(x_dict) # print(y) post = asf.postResource(repo, asid, y) print(post) # Save copy of new object print("Saving data to " + out_path_new + "....") with open(out_path_new, "w+") as f: f.write(y) except: print("Error: Could not process " + str(b)) print(sys.exc_info()) # raise quit()
def main(): # Main code goes here. asf.setServer("Prod") lookup_csv = "id_lookup_prod.csv" id_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-226-oclc/035s_20200915.txt" # Read a list of bibids and oclc strings the_data = [] with open(id_file) as ids: for row in csv.reader(ids, delimiter="|"): the_data.append([row[0], row[1], row[2]]) for a_row in the_data: bibid = a_row[0] print(bibid) str_2 = a_row[1] str_3 = a_row[2] try: repo, asid = asf.lookupByBibID(bibid, lookup_csv) x = asf.getResource(repo, asid) y = json.loads(x) user_defnd = y["user_defined"] if "user_defined" in y else {} user_defnd["string_2"] = str_2 user_defnd["string_3"] = str_3 print(user_defnd) y["user_defined"] = user_defnd z = json.dumps(y) post = asf.postResource(repo, asid, z) print(post) except Exception as e: print(e + ": Could not lookup " + str(bibid))
def main():
    # Main code goes here.

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    asf.setServer("Prod")

    the_sheet = dataSheet(
        "1UQm7ffd1Kq4zqlzHZajd9YkwW1_nmOJFS1W7nI-c_Vk", "new-batch!A:Z")

    output_folder = os.path.join(my_path, "output/resource_collecting_area")

    the_rows = the_sheet.getData()
    the_new_rows = []

    the_heads = the_rows.pop(0)
    the_new_rows.append(the_heads)

    coll_area_index = 8  # the column of collecting area

    for a_row in the_rows:
        the_new_row = a_row
        # print(a_row)
        coll = ""
        repo, asid = a_row[0], a_row[1]

        if len(a_row) > coll_area_index:
            # if there is a collecting location to add
            coll = a_row[coll_area_index]

            the_resource = asf.getResource(repo, asid)

            out_path_old = (
                output_folder + "/" + str(repo) + "_" + str(asid) + "_old.json"
            )
            out_path_new = (
                output_folder + "/" + str(repo) + "_" + str(asid) + "_new.json"
            )

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(the_resource)

            the_data = json.loads(the_resource)

            fix = False
            if "user_defined" in the_data:
                the_user_defined = the_data["user_defined"]
                if "enum_4" in the_user_defined:
                    print("Already has enum_4! Skipping.")
                else:
                    fix = True
                    the_user_defined["enum_4"] = coll

                the_data["user_defined"] = the_user_defined
                the_new_resource = json.dumps(the_data)

                # Save copy of new object
                print("Saving data to " + out_path_new + "....")
                with open(out_path_new, "w+") as f:
                    f.write(the_new_resource)

                if fix == True:
                    try:
                        post = "[NONE]"
                        post = asf.postResource(repo, asid, the_new_resource)
                        print(post)
                    except:
                        print("Error: There was a problem posting resource "
                              + str(repo) + ":" + str(asid) + "!")

                the_new_row.append(coll)
            else:
                print("ERROR: No user_defined data in "
                      + str(repo) + ":" + str(asid))

        the_new_rows.append(the_new_row)

    the_sheet.clear()
    the_sheet.appendData(the_new_rows)
    # print(the_new_rows)

    quit()
def main():

    # set to True to use test sheet and test json folder location.
    debug = False

    asf.setServer("Prod")

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ""  # set later
    today_str = str((date.today()).strftime("%Y%m%d"))

    if debug:
        print("[Running script in debug mode...]")
        parent_folder = "/cul/cul0/ldpd/archivesspace/test/resources"  # test folder
        sheet_id = "1wFyLN_Ea7ExCZSMuksB8MTrS9DjsUkwsmaPBujL7x0U"  # test sheet
        the_repos = [4]  # to test
    else:
        parent_folder = "/cul/cul0/ldpd/archivesspace/resources"
        sheet_id = "1T3EpIZmnh3Gk-VAIGtvavTQUIpS7AluyKQ8-sJsS8vg"
        the_repos = [2, 3, 4, 5, 6]

    output_folder = parent_folder + "/" + today_str

    the_sheets = {
        "resources": dataSheet(sheet_id, "Resources!A:Z"),
        "cm": dataSheet(sheet_id, "Collection Management!A:Z"),
        "log": dataSheet(sheet_id, "log!A:Z"),
    }

    # Set number of chars to truncate the scope and bioghist notes.
    trunc_len = 400

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["bibid", "/id_0"],
        ["title", "/title"],
        ["published", "/publish"],
        ["create_time", "/create_time"],
        ["system_mtime", "/system_mtime"],
        ["created_by", "/created_by"],
        ["last_modified_by", "/last_modified_by"],
        ["ead_location", "/ead_location"],
        ["ext_number", "/extents/0/number"],
        ["ext_portion", "/extents/0/portion"],
        ["ext_type", "/extents/0/extent_type"],
        # ["integer_1", "/user_defined/integer_1"],
        # ["integer_2", "/user_defined/integer_2"],
        # ["integer_3", "/user_defined/integer_3"],
        ["local call no.", "/user_defined/string_1"],
        ["other ctrl no. 1", "/user_defined/string_2"],
        ["other ctrl no. 2", "/user_defined/string_3"],
        ["other ctrl no. 3", "/user_defined/string_4"],
        # ["enum_1", "/user_defined/enum_1"],
        # ["enum_2", "/user_defined/enum_2"],
        ["description status", "/user_defined/enum_3"],
        ["collecting area", "/user_defined/enum_4"],
        ["level", "level"],
        # (Scope and bioghist notes are added in separately below.)
    ]

    # Get the collection management records for use in report.
    the_cms = []

    fields = [
        "id",
        "parent_id",
        "title",
        "system_mtime",
        "processing_priority",
        "processing_status",
    ]

    print(" ")
    print("*** Retrieve Collection Management Data ***")
    print(" ")

    for r in the_repos:
        print("Getting collection management records for repo: " + str(r) + "...")
        cm = asf.getCollectionManagements(r, filter="resource", fields=fields)
        for c in cm:
            row = [c[f] for f in fields]
            the_cms.append(row)

    # a data set of collection management records to post to sheet below.
    the_cms.insert(0, fields)

    print(" ")
    print("*** Retrieve Resource Data ***")
    print(" ")

    # Get the list of resources for each repo and add to the_ids
    the_ids = []
    for r in the_repos:
        print("Getting ids for repo: " + str(r) + "...")
        asids = json.loads(
            asf.getResponse("/repositories/" + str(r) + "/resources?all_ids=true")
        )
        print(str(len(asids)) + " records found in repo " + str(r) + ".")
        for i in asids:
            the_ids.append([r, i])

    # Construct the head row
    the_heads = [x[0] for x in the_fields]
    the_heads.insert(0, "asid")
    the_heads.insert(0, "repo")
    the_heads.append("scope note")
    the_heads.append("scopenote length")
    the_heads.append("bioghist note")
    the_heads.append("biognote length")

    the_output = [the_heads]

    # Fetch the resources from the ids
    print("Downloading resources...")

    if not os.path.exists(output_folder):
        print("Creating directory " + output_folder + "...")
        os.makedirs(output_folder)

    for repo, asid in the_ids:
        # print("Processing " + str(repo) + ":" + str(asid) + "...")
        the_row = [repo, asid]
        res_json = asf.getResource(repo, asid)
        res_dict = json.loads(res_json)

        out_path = output_folder + "/" + str(repo) + "_" + str(asid) + ".json"

        # Write the JSON to file.
        with open(out_path, "w+") as f:
            f.write(res_json)

        # Use dpath to extract values from dict and compose into rows.
        for af in the_fields:
            try:
                d = str(dpath.util.get(res_dict, af[1]))
            except:
                d = ""
            the_row.append(d)

        # Process scope and bioghist notes
        the_notes = dpath.util.values(res_dict, "notes/*", afilter=None)

        the_scope_notes = []
        the_biog_notes = []
        for a_note in the_notes:
            try:
                if a_note["type"] == "scopecontent":
                    the_scope_notes.append(a_note)
            except:
                pass
            try:
                if a_note["type"] == "bioghist":
                    the_biog_notes.append(a_note)
            except:
                pass

        if the_scope_notes:
            # If there are scope notes, grab all the text and concatenate.
            # Then get the total length in chars.
            scope_note_texts = [
                s["subnotes"][0]["content"] for s in the_scope_notes
            ]
            the_scope_text = " ".join(scope_note_texts)
            scope_note_len = len(the_scope_text)
            scope_note_short = truncate_str(the_scope_text, length=trunc_len)
        else:
            scope_note_short = ""
            scope_note_len = 0

        if the_biog_notes:
            # If there are bioghist notes, grab all the text and concatenate.
            # Then get the total length in chars.
            biog_note_texts = [
                s["subnotes"][0]["content"] for s in the_biog_notes
            ]
            the_biog_text = " ".join(biog_note_texts)
            biog_note_len = len(the_biog_text)
            biog_note_short = truncate_str(the_biog_text, length=trunc_len)
        else:
            biog_note_short = ""
            biog_note_len = 0

        the_row.append(scope_note_short)
        the_row.append(str(scope_note_len))
        the_row.append(biog_note_short)
        the_row.append(str(biog_note_len))

        the_output.append(the_row)

    # Zip up the JSON files for storage.
    zip_out = make_archive(
        today_str, "zip", root_dir=parent_folder, base_dir=today_str)
    print(zip_out)

    # Zip is saved in working dir; move to correct location.
    print("Saving zip file " + str(today_str) + ".zip to " + parent_folder)

    # Test if file already exists.
    if os.path.exists(parent_folder + "/" + str(today_str) + ".zip"):
        print("File " + parent_folder + "/" + str(today_str)
              + ".zip exists already. Replacing with new zip file...")
        os.remove(parent_folder + "/" + str(today_str) + ".zip")

    move(zip_out, parent_folder)

    # Remove the json folder once zip is in place.
    rmtree(parent_folder + "/" + today_str)

    util.file_cleanup(parent_folder, 60)

    # Write output to Google sheet.
    print(" ")
    print("*** Writing Data to Report ***")
    print(" ")

    the_sheets["cm"].clear()
    the_sheets["cm"].appendData(the_cms)
    digester.post_digest(
        script_name,
        "Total collection management records: " + str(len(the_cms) - 1))

    the_sheets["resources"].clear()
    the_sheets["resources"].appendData(the_output)
    digester.post_digest(
        script_name,
        "Total number of resource records: " + str(len(the_output) - 1))

    ########################
    ###    FINISH UP     ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + my_name + ". Start: " + start_time
               + ". Finished: " + end_time
               + " (duration: " + my_duration + ").")

    the_sheets["log"].appendData([[the_log]])

    print(" ")
    print(the_log)
    print(" ")

    exit_msg = ("Script done. Updated data is available at "
                + the_sheets["resources"].url)
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)
x = the_sheet.matchingRows([['BIBID', '4079432'], ['Title', '.*Humph.*']])
print(x)
print(' ')

x = the_sheet.lookup('4079432', 0, 1)
print(x)
print(' ')

print('testing archivesspace api...')

x = asf.getResource(2, 5907)
print(x)
print(' ')

print("testing saxon ...")

saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')
source_dir = '/cul/cul0/ldpd/archivesspace/oai'
in_file = os.path.join(source_dir, '20201111.asClean.xml')
xsl_file = os.path.join(my_path, '../xslt/extract-bibids.xsl')
params = 'filename=' + in_file

x = util.saxon_process(in_file, xsl_file, None, theParams=params)
print(x)
def main(): # Main code goes here. asf.setServer("Prod") on_site = False # set to True to get on-site note, False to get off-site note. See the_access_note var below. output_folder = "output/resource_on-site_access" lookup_csv = "id_lookup_prod.csv" # bibid_file = ( # "/Users/dwh2128/Documents/ACFA/TEST/ACFA-224-onsite-notes/acfa-224-list_3.csv" # ) bibid_file = ( "/Users/dwh2128/Documents/ACFA/TEST/ACFA-243-off-site/acfa-243_off-site.csv" ) # Read a list of bibids (csv) the_bibids = [] with open(bibid_file) as ids: for row in csv.reader(ids): the_bibids.append(row[0]) if on_site == True: the_access_note = { "jsonmodel_type": "note_multipart", "label": "Restrictions on Access", "type": "accessrestrict", "rights_restriction": {"local_access_restriction_type": []}, "subnotes": [ { "jsonmodel_type": "note_text", "content": "This collection is located on-site.", "publish": True, } ], "publish": True, } else: the_access_note = { "jsonmodel_type": "note_multipart", "label": "Restrictions on Access", "type": "accessrestrict", "rights_restriction": {"local_access_restriction_type": []}, "subnotes": [ { "jsonmodel_type": "note_text", "content": "This collection is located off-site. You will need to request this material at least three business days in advance to use the collection in the Rare Book and Manuscript Library reading room.", "publish": True, } ], "publish": True, } for bib in the_bibids: try: repo, asid = asf.lookupByBibID(bib, lookup_csv) except: print("Error: No record found for " + str(bib) + ". Skipping...") continue out_path_old = output_folder + "/" + str(repo) + "_" + str(asid) + "_old.json" out_path_new = output_folder + "/" + str(repo) + "_" + str(asid) + "_new.json" the_resource = asf.getResource(repo, asid) # Save copy of existing object print("Saving data to " + out_path_old + "....") with open(out_path_old, "w+") as f: f.write(the_resource) the_data = json.loads(the_resource) # Test if there is already an access restriction note. has_note = False for a_note in the_data["notes"]: try: if a_note["type"] == "accessrestrict": has_note = True except KeyError: print("Note has no type -- skipping.") if has_note == True: print(str(bib) + " - Warning: Already has access note.") # else: the_data["notes"].append(the_access_note) the_new_resource = json.dumps(the_data) # Save copy of new object print("Saving data to " + out_path_new + "....") with open(out_path_new, "w+") as f: f.write(the_new_resource) try: post = asf.postResource(repo, asid, the_new_resource) print(post) except: print( "Error: There was a problem posting resource " + str(repo) + ":" + str(asid) + "!" ) quit()
def main(): asf.setServer("Prod") # the_lookup_csv = "id_lookup_TEST.csv" # test the_lookup_csv = "id_lookup_prod.csv" # test output_folder = "output/resource_language_encode" the_sheet = dataSheet("1eTPY7AbDvjDU-lzK2VQruvZAvlGkAJZglh2JrruPvdg", "Test6!A:Z") the_data = the_sheet.getData() the_new_data = [] the_new_data.append(the_data.pop(0)) counter = 0 for a_row in the_data: counter += 1 print(" ") print(counter) the_new_row = a_row the_bibid = a_row[0] the_041 = a_row[1] the_string = a_row[3] res_info = asf.lookupByBibID(the_bibid, the_lookup_csv) if res_info: out_path_old = ( output_folder + "/" + str(res_info[0]) + "_" + str(res_info[1]) + "_old.json" ) out_path_new = ( output_folder + "/" + str(res_info[0]) + "_" + str(res_info[1]) + "_new.json" ) # pull down the resource the_resource = asf.getResource(res_info[0], res_info[1]) # Save copy of existing object print("Saving data to " + out_path_old + "....") with open(out_path_old, "w+") as f: f.write(the_resource) res_dict = json.loads(the_resource) langmaterials = res_dict["lang_materials"] # Collect encoded languages already present. There should be just one but not guaranteed, so make a list. primary_langs = [] for n in langmaterials: try: if n["language_and_script"]: # print("YES") primary_langs.append(n["language_and_script"]["language"]) except: print("Exception!") print("old:") print(primary_langs) print("new:") langs_parsed = language_lookup(the_string) print(langs_parsed) print("to add: ") langs_diff = diff(langs_parsed, primary_langs) print(langs_diff) if len(langs_diff) > 0: for l in langs_diff: res_dict["lang_materials"].append(make_language_note(l)) new_resource = json.dumps(res_dict) # Save new object print("Saving data to " + out_path_new + "....") with open(out_path_new, "w+") as f: f.write(new_resource) # Post new resource back to API print("Posting data for " + str(res_info[0]) + " : " + str(res_info[1])) try: post = asf.postResource(res_info[0], res_info[1], new_resource) print(post) except: print( "Error: There was a problem posting resource " + str(res_info[0]) + ":" + str(res_info[1]) + "!" ) langs_diff.append("[ERROR]") else: print("No new languages to add. Skipping.") the_new_row.append(",".join(langs_diff)) the_new_data.append(the_new_row) the_sheet.clear() the_sheet.appendData(the_new_data)
def test_get_resource_dev():
    x = json.loads(asf.getResource(2, 5907))
    assert x['id_0'] == '4078601', "Dev: BIBID for resource 2:5907 should be 4078601"
def main():

    asf.setServer('Test')

    # Google sheet used for reporting changes.
    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'resources!A:Z')

    id_file = 'resource_replacements.csv'
    output_folder = 'output/resource_replacements'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    # Search/replace patterns
    the_search_pattern = 'NCC'
    the_replace_pattern = 'NNC'

    the_before_afters = []

    the_heads = ['repo', 'asid', 'before', 'after']
    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[1] + '_old.json'

        # read from API
        x = asf.getResource(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')
        f = open(out_path, "w+")
        f.write(x)
        f.close()

        x = json.loads(x)

        the_old_field_data = x['user_defined']['string_2']

        y = x
        y['user_defined']['string_2'] = re.sub(
            the_search_pattern, the_replace_pattern,
            x['user_defined']['string_2'])

        if y['user_defined']['string_2'] == the_old_field_data:
            the_new_field_data = "[no change]"
        else:
            the_new_field_data = y['user_defined']['string_2']

        the_before_afters.append([
            an_obj[0], an_obj[1],
            '{string_2} ' + the_old_field_data,
            '{string_2} ' + the_new_field_data
        ])

        # convert dict back to json for posting.
        z = json.dumps(y)

        # Post the fixed object back to API.
        post = asf.postResource(an_obj[0], an_obj[1], z)
        print(post)

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)