def main():
    asf.setServer('Prod')

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y-%m-%d"))

    sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:
        print('processing ' + d['filter'])
        the_sheet = dataSheet(sheet_id, d['range'])
        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]
        the_modifieds = []

        for r in the_repos:
            print('searching repo ' + str(r))
            x = asf.getByDate(r,
                              the_date,
                              date_type='mtime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                print(row)
                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))
        # the_sheet.clear()
        # the_sheet.appendData([the_fields])
        the_sheet.appendData(the_modifieds)

    quit()

def main():
    # Main code goes here.

    asf.setServer("Prod")

    output_folder = "output/resource_remove_links"
    the_lookup_csv = "id_lookup_prod.csv"
    bibid_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-161-remove-links/acfa-161-remove-links.txt"

    # Read a list of bibids (csv)
    the_bibids = []
    with open(bibid_file) as ids:
        for row in csv.reader(ids):
            the_bibids.append(row[0])

    for b in the_bibids:
        try:
            repo, asid = asf.lookupByBibID(b, the_lookup_csv)
            print("Processing " + str(b) + "...")

            out_path_old = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_old.json")
            out_path_new = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_new.json")

            x = asf.getResource(repo, asid)

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(x)

            x_dict = json.loads(x)
            # Only print/delete the URL if it is present; otherwise report that
            # there is nothing to remove.
            if "ead_location" in x_dict:
                print(x_dict["ead_location"])
                del x_dict["ead_location"]
            else:
                print("No URL to delete!")

            y = json.dumps(x_dict)
            # print(y)

            post = asf.postResource(repo, asid, y)
            print(post)

            # Save copy of new object
            print("Saving data to " + out_path_new + "....")
            with open(out_path_new, "w+") as f:
                f.write(y)

        except:
            print("Error: Could not process " + str(b))
            print(sys.exc_info())
            # raise

    quit()

def main():
    # Test functions here.

    from pprint import pprint

    server = 'Test'
    asf.setServer(server)

    # The resource to scan
    the_resource = (4, 6288)

    # A place to put output of saved json objects (optional)
    output_folder = 'output/replace_extrefs'

    # Retrieve all archival objects under a given resource
    x = asf.getResponse('/repositories/' + str(the_resource[0]) +
                        '/resources/' + str(the_resource[1]) +
                        '/ordered_records')
    y = json.loads(x)['uris']

    # Select only the ones that are items or files, and add to a list
    the_refs = [r['ref'] for r in y if r['level'] in ['item', 'file']]

    cnt = 0
    for a_ref in the_refs:
        ref_decomposed = a_ref.split('/')
        repo, asid = ref_decomposed[2], ref_decomposed[4]

        ref_json = asf.getArchivalObject(repo, asid)
        out_path = output_folder + '/' + str(repo) + '_' + str(asid) + '.json'
        data_old = ref_json

        # The regex substitution
        repl = re.subn(r'<extref\s+type=\\"simple\\"\s+href=',
                       r'<extref xlink:type=\"simple\" xlink:href=',
                       ref_json,
                       flags=re.DOTALL)

        if repl[1] > 0:  # [1] is the count of replacements from subn
            # there is a change

            # Save copy of existing object
            print('Saving data to ' + out_path + '....')
            with open(out_path, "w+") as f:
                f.write(data_old)

            data_new = repl[0]
            cnt += 1
            print('Posting ' + str(repo) + '_' + str(asid) + ' to ' + server)
            z = asf.postArchivalObject(repo, asid, data_new)
            print(z)
            print(' ')

    print('Total replacements: ' + str(cnt))

def main(): # SERVER = "Test" # test SERVER = "Prod" asf.setServer(SERVER) LOOKUP = '/Users/dwh2128/Documents/git/dcps-utils/archivesspace/as_reports/id_lookup_prod.csv' sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI' read_sheet = dataSheet(sheet_id, 'TEST!A:Z') # Test write_sheet = dataSheet(sheet_id, 'Output!A:Z') the_data = read_sheet.getData() the_data.pop(0) # print(the_refs) the_output = [] for r in the_data: bibid = r[0] repo = r[1] ref = r[2] extref_old = r[3] extref_new = r[5] the_res = json.loads(asf.getResourceByBibID(bibid, LOOKUP)) # pprint(the_res) asid = the_res['uri'].split('/')[4] print("repo: " + str(repo) + "; asid: " + str(asid)) the_notes = json.dumps(the_res['notes']) # print(the_notes) print(" ") the_new_notes = replace_notes( the_notes, [ # fix problem of leading space in href {'find': 'xlink:href=\\" http', 'replace': 'xlink:href=\\"http'}, # replace old url with new one {'find': extref_old, 'replace': extref_new}]) # print(the_new_notes) the_res['notes'] = json.loads(the_new_notes) x = asf.postResource(repo, asid, json.dumps(the_res)) out_row = [SERVER, repo, asid, ref, extref_old, extref_new, str(x)] print(out_row) the_output.append(out_row) # # write_sheet.clear() write_sheet.appendData(the_output) quit()
def main():

    sheet_id = "1GEeNpKBhfjOCJGx1zJfi6XgZ4OWhGhzWsOHRT9DkmpY"
    # list_sheet = dataSheet(sheet_id, 'Test!A:Z')  # test
    list_sheet = dataSheet(sheet_id, "batch!A:Z")
    report_sheet = dataSheet(sheet_id, "output!A:Z")

    the_uris = list_sheet.getDataColumns()[0]

    output_data = []
    for uri in the_uris:
        asid = uri.split("/")[3]
        x = fix_agent(asid, "families")
        pprint(x["display_name"])
        res = asf.postAgent(asid, json.dumps(x), agent_type="families")
        print(res)
        row = [SERVER, uri, str(res)]
        output_data.append(row)

    print(output_data)
    report_sheet.appendData(output_data)
    quit()

def main():

    asf.setServer('Prod')

    # the_repos = [2, 3, 4, 5]
    the_repos = [2]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_sheet = dataSheet('198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY',
                          'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid)
            row.insert(0, repo_id)

            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)

            the_unpublished.append(row)
            print(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    print('Total unpublished: ' + str(len(the_unpublished)))

    # the_sheet.clear()
    # the_sheet.appendData([the_fields])
    # the_sheet.appendData(the_unpublished)
    quit()

def main(): SERVER = "Prod" # test # SERVER = "Prod" asf.setServer(SERVER) sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI' read_sheet = dataSheet(sheet_id, 'TEST!A:Z') # Test write_sheet = dataSheet(sheet_id, 'Output!A:Z') the_data = read_sheet.getData() the_data.pop(0) # print(the_refs) the_output = [] for r in the_data: repo = r[1] ref = r[2] extref_old = r[3] extref_new = r[5] the_ao = json.loads(asf.getArchivalObjectByRef(repo, ref)) asid = the_ao['uri'].split('/')[4] print("asid: " + str(asid)) the_notes = json.dumps(the_ao['notes']) # fix problem of leading space in href the_new_notes = the_notes.replace('xlink:href=\\" http', 'xlink:href=\\"http') # replace old url with new one the_new_notes = the_new_notes.replace(extref_old, extref_new) print(the_new_notes) the_ao['notes'] = json.loads(the_new_notes) pprint(the_ao) x = asf.postArchivalObject(repo, asid, json.dumps(the_ao)) out_row = [SERVER, repo, asid, ref, extref_old, extref_new, str(x)] print(out_row) the_output.append(out_row) # write_sheet.clear() write_sheet.appendData(the_output) quit()
def main():

    server = 'Prod'
    asf.setServer(server)

    enum_num = 14  # extent_extent_type enumeration
    extent_data = asf.getEnumeration(enum_num)

    extent_usage_csv = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-111-extents-cleanup/extent-values-prod3.tsv'

    output_folder = 'output/enumerations'

    # Paths for reporting before/after data
    out_path_old = output_folder + '/' + str(enum_num) + 'PROD_old.json'
    out_path_new = output_folder + '/' + str(enum_num) + 'PROD_new.json'

    # Save copy of existing object
    print('Saving data to ' + out_path_old + '....')
    with open(out_path_old, "w+") as f:
        f.write(extent_data)

    # Load list from csv
    csv.register_dialect('my_dialect', delimiter='\t', quoting=csv.QUOTE_NONE)
    data = []
    with open(extent_usage_csv) as the_csv_data:
        for row in csv.reader(the_csv_data, 'my_dialect'):
            data.append(row)

    # A list of ids of extent values to remove
    unused_extents = [x[0] for x in data if x[2] == 'Not used.']

    for e in unused_extents:
        print('suppressing ' + str(e))
        # mode='suppress' to suppress, mode='unsuppress' to unsuppress
        post = asf.suppressEnumerationValue(e, mode='suppress')
        print(post)

    extent_data_new = asf.getEnumeration(enum_num)

    # Save updated object
    print('Saving data to ' + out_path_new + '....')
    with open(out_path_new, "w+") as f:
        f.write(extent_data_new)

def main():

    # set to Prod | Dev | Test
    asf.setServer('Prod')

    bibid_file = "ead_bibids_20190520.txt"
    lookup_file = "id_lookup_prod_20190522.csv"
    outfile_loc = "ead_as_qc_reports/ead_as_qc_xml_PROD1"

    with open(bibid_file) as f:
        the_bibids = [line.rstrip('\n') for line in f]

    the_errors = []
    the_processed = []

    for a_bibid in the_bibids:
        print('Processing bibid: ' + a_bibid)
        if a_bibid:
            try:
                the_lookup = asf.lookupByBibID(a_bibid, lookup_file)
                the_repo = the_lookup[0]
                the_asid = the_lookup[1]
                the_processed.append(a_bibid)
            except:
                # Can't find in lookup
                the_repo = 0
                the_asid = 0
                the_errors.append(a_bibid)

        if (a_bibid and the_asid != 0):
            the_ead = asf.getEAD(the_repo, the_asid)
            the_filepath = outfile_loc + '/' + a_bibid + '_ead.xml'
            with open(the_filepath, "w") as myfile:
                myfile.write(the_ead)

    # Report results
    print('Processed ' + str(len(the_processed)) + ' records.')
    if len(the_errors) > 0:
        print('*** Warning: ' + str(len(the_errors)) +
              ' errors. Could not process id ' + ', '.join(the_errors) +
              ' ***')

def fix_agent(asid, agent_type):
    x = json.loads(asf.getAgent(asid, agent_type=agent_type))
    for name in x["names"]:
        print(name)
        if name["is_display_name"] == True:
            name["source"] = "local"
            name["rules"] = "dacs"
    x["display_name"]["source"] = "local"
    x["display_name"]["rules"] = "dacs"
    return x

def main():
    # Main code goes here.

    asf.setServer("Prod")

    lookup_csv = "id_lookup_prod.csv"
    id_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-226-oclc/035s_20200915.txt"

    # Read a list of bibids and oclc strings
    the_data = []
    with open(id_file) as ids:
        for row in csv.reader(ids, delimiter="|"):
            the_data.append([row[0], row[1], row[2]])

    for a_row in the_data:
        bibid = a_row[0]
        print(bibid)
        str_2 = a_row[1]
        str_3 = a_row[2]

        try:
            repo, asid = asf.lookupByBibID(bibid, lookup_csv)

            x = asf.getResource(repo, asid)
            y = json.loads(x)

            user_defnd = y["user_defined"] if "user_defined" in y else {}
            user_defnd["string_2"] = str_2
            user_defnd["string_3"] = str_3
            print(user_defnd)

            y["user_defined"] = user_defnd

            z = json.dumps(y)
            post = asf.postResource(repo, asid, z)
            print(post)

        except Exception as e:
            # Cast the exception to str before concatenating.
            print(str(e) + ": Could not lookup " + str(bibid))

def main(): # SERVER = "Test" # test SERVER = "Prod" asf.setServer(SERVER) sheet_id = '1OABHEJF1jqA1vlbW5yTENry5W7YqKlag5nJDJ9ouCzg' # read_sheet = dataSheet(sheet_id, 'Test!A:Z') # Test read_sheet = dataSheet(sheet_id, 'Prod!A:Z') # Test write_sheet = dataSheet(sheet_id, 'output!A:Z') the_refs = read_sheet.getDataColumns()[0] # print(the_refs) the_output = [] for r in the_refs: the_ao = json.loads(asf.getArchivalObjectByRef(2, r)) asid = the_ao['uri'].split('/')[4] old_date = str(the_ao['dates'][0]['begin']) new_ao = fix_begin_date(2, the_ao) new_date = str(new_ao['dates'][0]['begin']) print("asid: " + str(asid)) x = asf.postArchivalObject(2, asid, json.dumps(new_ao)) out_row = [SERVER, r, asid, old_date, new_date, str(x)] # print(out_row) the_output.append(out_row) write_sheet.clear() write_sheet.appendData(the_output) quit() x = fix_begin_date(2, 'b2ec9ce511e4212ebb145fb909ca85bd') print(x) pprint( json.loads( asf.getArchivalObjectByRef(2, 'b2ec9ce511e4212ebb145fb909ca85bd'))) quit()
def add_authority(server, asid, uri, source=None):
    # function to (1) query subject and determine if it already has
    # an authority uri, (2) if not, add in the provided URI,
    # and (3) return a response for reporting.
    subj = asf.getSubject(asid)

    if 'authority_id' in subj:
        print('*** Subject ' + str(asid) + ' already has authority: ' +
              subj['authority_id'] + ' .... Skipping....')
        return [server, asid, subj['authority_id'], subj['source'], 'Y']
    else:
        subj['authority_id'] = uri
        if source is None:
            source = subj['source']
        else:
            # If a new source is provided, add it in as well.
            subj['source'] = source
        try:
            resp = asf.postSubject(asid, json.dumps(subj))
        except json.JSONDecodeError as e:
            resp = 'JSON ERROR: ' + str(asid) + ' :: ' + str(e)
        except Exception as e:
            resp = 'ERROR: ' + str(asid) + ' :: ' + str(e)
        print(resp)
        return [server, asid, uri, str(source), '', str(resp)]

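# A minimal driver sketch for add_authority (not part of the original script):
# it assumes each row of the batch tab holds [asid, authority_uri] in that
# order, which is a hypothetical layout -- adjust to match the real sheet.
def run_authority_batch():
    the_rows = list_sheet.getData()
    the_rows.pop(0)  # assumes the batch tab has a header row

    report_rows = []
    for a_row in the_rows:
        an_asid, an_uri = a_row[0], a_row[1]
        report_rows.append(add_authority(SERVER, an_asid, an_uri))

    report_sheet.appendData(report_rows)
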
def get_agent_data(name, endpoint, pickle_path):
    print("Getting agents: " + name)
    # out_path = os.path.join(my_path, "output/agents_" + i["name"] + ".pickle")
    # out_path = os.path.join(out_folder, "agents_" + i["name"] + ".pickle")

    # Get a list of agent ids from API
    agents_list = json.loads(asf.getResponse(endpoint + "?all_ids=true"))

    agent_cnt_str = "Number of agents (" + \
        name + "): " + str(len(agents_list))
    print(agent_cnt_str)
    log_it(SCRIPT_NAME, agent_cnt_str)

    agent_data = []
    # Loop through agent ids and get full record from API.
    for cnt, agent in enumerate(agents_list):
        # print("COUNT: " + str(cnt))
        # print("Agent # " + str(agent))
        x = asf.getResponse(endpoint + "/" + str(agent))
        agent_data.append(json.loads(x))

    # Save data as pickle
    util.pickle_it(agent_data, pickle_path)

    return agent_data

def harvestBatchEAD(ids_file, lookup_file, out_folder):
    bibidFile = ids_file
    lookupFile = lookup_file
    outFolder = out_folder

    with open(bibidFile) as f:
        the_bibids = [line.rstrip('\n') for line in f]

    the_errors = []
    the_processed = []

    for a_bibid in the_bibids:
        print('Processing bibid: ' + a_bibid)
        if a_bibid:
            try:
                the_lookup = asf.lookupByBibID(a_bibid, lookupFile)
                the_repo = the_lookup[0]
                the_asid = the_lookup[1]
                the_processed.append(a_bibid)
            except:
                # Can't find in lookup
                the_repo = 0
                the_asid = 0
                the_errors.append(a_bibid)

        # print(the_repo)
        # print(the_asid)

        if (a_bibid and the_asid != 0):
            the_ead = getSingleEAD(the_repo, the_asid)
            the_filepath = outFolder + '/' + a_bibid + '_ead.xml'
            with open(the_filepath, "w") as myfile:
                myfile.write(the_ead)

    # Report results
    print('Processed ' + str(len(the_processed)) + ' records.')
    if len(the_errors) > 0:
        print('*** Warning: ' + str(len(the_errors)) +
              ' errors. Could not process id ' + ', '.join(the_errors) +
              ' ***')

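# Example call (a sketch, not part of the original script): the id and lookup
# files below are borrowed from the EAD QC script above, and the output folder
# is illustrative only.
# harvestBatchEAD("ead_bibids_20190520.txt",
#                 "id_lookup_prod_20190522.csv",
#                 "ead_as_qc_reports/ead_as_qc_xml_PROD1")
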
def main():
    # Main code goes here.

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    asf.setServer("Prod")

    the_sheet = dataSheet("1UQm7ffd1Kq4zqlzHZajd9YkwW1_nmOJFS1W7nI-c_Vk",
                          "new-batch!A:Z")

    output_folder = os.path.join(my_path, "output/resource_collecting_area")

    the_rows = the_sheet.getData()
    the_new_rows = []

    the_heads = the_rows.pop(0)
    the_new_rows.append(the_heads)

    coll_area_index = 8  # the column of collecting area

    for a_row in the_rows:
        the_new_row = a_row
        # print(a_row)
        coll = ""
        repo, asid = a_row[0], a_row[1]

        # Note: use > (not >=) so a_row[coll_area_index] is in range.
        if len(a_row) > coll_area_index:
            # if there is a collecting location to add
            coll = a_row[coll_area_index]

            the_resource = asf.getResource(repo, asid)

            out_path_old = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_old.json")
            out_path_new = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_new.json")

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(the_resource)

            the_data = json.loads(the_resource)

            fix = False
            if "user_defined" in the_data:
                the_user_defined = the_data["user_defined"]
                if "enum_4" in the_user_defined:
                    print("Already has enum_4! Skipping.")
                else:
                    fix = True
                    the_user_defined["enum_4"] = coll
                    the_data["user_defined"] = the_user_defined
                    the_new_resource = json.dumps(the_data)

                    # Save copy of new object
                    print("Saving data to " + out_path_new + "....")
                    with open(out_path_new, "w+") as f:
                        f.write(the_new_resource)

                if fix == True:
                    try:
                        post = "[NONE]"
                        post = asf.postResource(repo, asid, the_new_resource)
                        print(post)
                    except:
                        print("Error: There was a problem posting resource " +
                              str(repo) + ":" + str(asid) + "!")

                the_new_row.append(coll)
            else:
                print("ERROR: No user_defined data in " + str(repo) + ":" +
                      str(asid))

        the_new_rows.append(the_new_row)

    the_sheet.clear()
    the_sheet.appendData(the_new_rows)
    # print(the_new_rows)

    quit()

script_name = os.path.basename(my_name)

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

sheet_id = "1pZk2tPMuZDOd1veOBSJNRk2fprA6p3Qb3WKZDtZay88"
the_sheet = dataSheet(sheet_id, "subjects!A:Z")
# the_sheet = dataSheet(sheet_id, "test!A:Z")  # test

now1 = datetime.datetime.now()
start_time = str(now1)
end_time = ""  # set later

# First get the subject records from API (this can take a long time!)

asf.setServer("Prod")  # AS instance: Prod | Dev | Test

# out_path = os.path.join(my_path, "output/subjects.pickle")
out_path = "/cul/cul0/ldpd/archivesspace/subjects/subjects.pickle"

# uncomment to do the full download.
the_subjects = asf.getSubjects()
util.pickle_it(the_subjects, out_path)

# Report the saved data to Google Sheet

# List of fields to extract, expressed as dpaths.
the_fields = [
    ["uri", "uri"],
    ["title", "title"],
    ["source", "source"],
def main():

    # Set to Test | Dev | Prod
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'daos-publish!A:Z')

    # Set value to switch to, publish (True) or unpublish (False)
    publish_value = True

    # id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-162/acfa-162-mitchell.csv'
    id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-162/acfa-162-kay.csv'
    output_folder = 'output/daos-publish'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    the_before_afters = []

    the_heads = ['repo', 'asid', 'uid', 'title', 'before', 'after']
    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[1] + '_old.json'

        # read from API
        # try:
        x = asf.getDigitalObjectFromParent(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')
        f = open(out_path, "w+")
        f.write(x)
        f.close()

        x = json.loads(x)

        # the_old_field_data = x['file_versions'][0]['file_uri']
        the_old_field_data = x['publish']

        asid = str(x['uri'].split('/')[-1])  # get the asid from the uri string.
        title = x['title']
        repo = str(an_obj[0])

        y = x

        # Here set the desired value
        y['publish'] = publish_value

        if y['publish'] == the_old_field_data:
            the_new_field_data = "[no change]"
        else:
            the_new_field_data = y['publish']

        the_before_afters.append([
            an_obj[0], asid, an_obj[1], title, the_old_field_data,
            the_new_field_data
        ])

        # convert dict back to json for posting.
        z = json.dumps(y)

        # Post the fixed object back to API.
        # (Comment these out for testing.)
        if the_new_field_data != "[no change]":
            resp = asf.postDigitalObject(repo, asid, z)
            print(resp)
        else:
            print('No update: skipping record.')

        # except:
        #     print('Could not retrieve record ' + str(an_obj[1]))

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')

    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")

    quit()

def main():

    asf.setServer('Prod')  # AS instance: Prod | Dev | Test
    mode = 'Prod'  # Prod or Test

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    ########################
    ### PROCESS OAI DATA ###
    ########################

    # Set path to Saxon processor
    # saxon_path = os.path.join(my_path, "../../resources/saxon-9.8.0.12-he.jar")

    # XSLT file to generate report
    marc_xslt_file = os.path.join(my_path, '../xslt/marcDataExtract.xsl')

    if mode == 'Prod':
        # OAI XML file to use as source
        # source_dir='/cul/cul0/lito/libsys/voyager/prod/data/loads/AS_harvest'
        source_dir = '/cul/cul0/ldpd/archivesspace/oai'
        sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
        oai_file = source_dir + '/' + yest_str + '.asAllRaw.xml'
    else:
        # TEST
        yest_str = "20190915"
        # OAI XML file to use as source
        source_dir = '/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/cached_eads_20190912'  # local test
        sheet_id = '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ'
        oai_file = yest_str + '.asAllRaw.xml'

    the_sheets = {
        'oai': dataSheet(sheet_id, 'oai!A:Z'),
        'oai_last': dataSheet(sheet_id, 'oai_last!A:Z'),
        'log': dataSheet(sheet_id, 'log!A:Z')
    }

    the_outpath = os.path.join(
        my_path, 'output/' + yest_str + '.marc_reporter_out.xml')

    print(' ')

    # Copy oai current data to oai_last sheet for diff
    the_old_data = the_sheets['oai'].getData()
    the_sheets['oai_last'].clear()
    the_sheets['oai_last'].appendData(the_old_data)

    # Process OAI MARC and output to CSV
    util.saxon_process(oai_file, marc_xslt_file, the_outpath)

    # clear data from "new" sheet
    the_sheets['oai'].clear()

    # Send result csv to Google Sheet.
    y = the_sheets['oai'].importCSV(the_outpath, delim='|')

    print(' ')

    ###########################
    ### PROCESS UNPUBLISHED ###
    ###########################

    print('Finding unpublished records...')

    the_repos = [2, 3, 4, 5]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_heads = [
        'REPO', 'REPO_ID', 'RESOURCE_ID', 'TITLE', 'BIBID', 'CREATE_TIME',
        'SYSTEM_MTIME', 'LAST_MODIFIED_BY'
    ]

    unpubs_sheet = dataSheet(sheet_id, 'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # print(row)
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid)
            row.insert(0, repo_id)

            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)

            the_unpublished.append(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    # print('Total unpublished: ' + str(len(the_unpublished)))
    msg = 'Total unpublished: ' + str(len(the_unpublished))
    print(msg)
    digester.post_digest(script_name, msg)  # Test

    unpubs_sheet.clear()
    unpubs_sheet.appendData([the_heads])
    unpubs_sheet.appendData(the_unpublished)

    #########################
    ### GET NEWLY CREATED ###
    #########################

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:
        print('processing ' + d['filter'])
        the_delta_sheet = dataSheet(sheet_id, d['range'])
        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]
        the_heads = [
            'repo', 'asid', 'title', 'identifier', 'create_time',
            'system_mtime', 'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:
            print('searching repo ' + str(r))
            x = asf.getByDate(r,
                              the_date,
                              date_type='ctime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                # print(row)
                # get the repo from the uri string.
                repo = str(row[0].split('/')[-3]).rstrip()
                # get the asid from the uri string.
                asid = str(row[0].split('/')[-1]).rstrip()
                row.pop(0)
                row.insert(0, asid)
                row.insert(0, repo)
                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))
        digester.post_digest(
            script_name,
            'New ' + d['filter'] + ': ' + str(len(the_modifieds)))  # Test
        # the_sheet.clear()
        # the_sheet.appendData([the_fields])
        the_delta_sheet.appendData(the_modifieds)

    ########################
    ###    FINISH UP     ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = 'Data imported by ' + my_name + '. Start: ' + start_time + \
        '. Finished: ' + end_time + ' (duration: ' + my_duration + ').'

    the_sheets['log'].appendData([[the_log]])

    print(' ')
    print(the_log)
    digester.post_digest(script_name, the_log)  # Test
    print(' ')
    print('Script done. Updated data is available at ' +
          the_sheets['oai'].url)

def main():

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    if DEBUG is True:
        sheet_id = "18uvn9wIABHVIdjlSRNXqnHUKB2aTvZgKO62e-UFNuO8"  # test
    else:
        sheet_id = "1dTeMAK_cGWAUvrqvAiY2hGy4gJewrmWjnuIZu8NhWwE"

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    # First get the agent records from API (this can take a long time!)

    asf.setServer("Prod")  # AS instance: Prod | Dev | Test

    if DEBUG is True:
        out_folder = "/cul/cul0/ldpd/archivesspace/test/agents"
    else:
        out_folder = "/cul/cul0/ldpd/archivesspace/agents"

    family_agents_file = os.path.join(out_folder, "agents_families.pickle")
    corp_agents_file = os.path.join(out_folder, "agents_corporate.pickle")
    persons_agents_file = os.path.join(out_folder, "agents_persons.pickle")

    the_info = [
        {
            "name": "families",
            "endpoint": "/agents/families",
            "sheet": dataSheet(sheet_id, "families!A:Z"),
            "pickle": family_agents_file
        },
        {
            "name": "corporate",
            "endpoint": "/agents/corporate_entities",
            "sheet": dataSheet(sheet_id, "corporate!A:Z"),
            "pickle": corp_agents_file
        },
        {
            "name": "persons",
            "endpoint": "/agents/people",
            "sheet": dataSheet(sheet_id, "persons!A:Z"),
            "pickle": persons_agents_file
        },
    ]

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["uri", "uri"],
        ["title", "title"],
        ["source", "names/0/source"],
        ["authority_id", "names/0/authority_id"],
        ["is_linked_to_published_record", "is_linked_to_published_record"],
        ["publish", "publish"],
        ["last_modified_by", "last_modified_by"],
        ["last_modified", "system_mtime"],
    ]

    the_record_cnts = {}

    if DEBUG is True:
        print("*** (DEBUG MODE) ***")

    for i in the_info:
        print("Getting agents: " + i["name"])
        agent_data = get_agent_data(i["name"], i["endpoint"], i["pickle"])

        print(" ")

        # Report the saved data to Google Sheet
        the_sheet = i["sheet"]

        the_heads = [x[0] for x in the_fields]
        the_output = [the_heads]

        the_record_cnts[i["name"]] = str(len(agent_data))

        for agent in agent_data:
            the_row = []
            # Use dpath to extract values from dict and compose into rows.
            for af in the_fields:
                try:
                    d = str(dpath.util.get(agent, af[1]))
                except:
                    d = ""
                the_row.append(d)
            # print(the_row)
            the_output.append(the_row)

        the_sheet.clear()
        save = the_sheet.appendData(the_output)
        print(save)

    # Generate log

    print(the_record_cnts)
    print(" ".join(the_record_cnts))

    cnt_str = "".join(k + "=" + v + ". " for k, v in the_record_cnts.items())
    # print(cnt_str)

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + MY_NAME + ". " + cnt_str + " Start: " +
               start_time + ". Finished: " + end_time + " (duration: " +
               my_duration + ").")

    log_range = "log!A:A"
    log_sheet = dataSheet(sheet_id, log_range)

    log_sheet.appendData([[the_log]])

    print(" ")
    print(the_log)
    log_it(SCRIPT_NAME, the_log)
    # digester.post_digest(SCRIPT_NAME, the_log)
    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        "https://docs.google.com/spreadsheets/d/" + \
        str(sheet_id) + "/edit?usp=sharing"

    print(exit_msg)
    log_it(SCRIPT_NAME, exit_msg)

    quit()

# Automated reporting of ArchivesSpace accessions info.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
from operator import itemgetter
import datetime
import re
import os.path
import dateutil.parser
import digester  # for generating composite digest of report info.

# set Prod | Dev | Test
target_server = 'Prod'  # Prod | Dev | Test
asf.setServer(target_server)

DEBUG = False

# mode = 'Prod'  # Prod or Test

MY_NAME = __file__
SCRIPT_NAME = os.path.basename(MY_NAME)

# This makes sure the script can be run from any working directory and still find related files.
MY_PATH = os.path.dirname(__file__)

# File to use to lookup bibids
LOOKUP_CSV = os.path.join(MY_PATH, "id_lookup_prod.csv")


def main():
# Script to add authorities or make other changes to subjects. See ACFA-287.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
import os.path

SERVER = 'Prod'
asf.setServer(SERVER)

my_name = __file__

# pprint(asf.getSubject(11453))
# quit()

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

sheet_id = '1b-dFdOaWD7AEqzhK0uuGXkonum6wX8Zcriq8-G4l33Q'
# list_sheet = dataSheet(sheet_id, 'Test!A:Z')  # test
list_sheet = dataSheet(sheet_id, 'batch!A:Z')
report_sheet = dataSheet(sheet_id, 'output!A:Z')


def add_authority(server, asid, uri, source=None):
    # function to (1) query subject and determine if it already has
    # an authority uri, (2) if not, add in the provided URI,
def main():
    # Main code goes here.

    asf.setServer("Prod")

    # set to True to get on-site note, False to get off-site note.
    # See the_access_note var below.
    on_site = False

    output_folder = "output/resource_on-site_access"

    lookup_csv = "id_lookup_prod.csv"

    # bibid_file = (
    #     "/Users/dwh2128/Documents/ACFA/TEST/ACFA-224-onsite-notes/acfa-224-list_3.csv"
    # )
    bibid_file = (
        "/Users/dwh2128/Documents/ACFA/TEST/ACFA-243-off-site/acfa-243_off-site.csv"
    )

    # Read a list of bibids (csv)
    the_bibids = []
    with open(bibid_file) as ids:
        for row in csv.reader(ids):
            the_bibids.append(row[0])

    if on_site == True:
        the_access_note = {
            "jsonmodel_type": "note_multipart",
            "label": "Restrictions on Access",
            "type": "accessrestrict",
            "rights_restriction": {"local_access_restriction_type": []},
            "subnotes": [
                {
                    "jsonmodel_type": "note_text",
                    "content": "This collection is located on-site.",
                    "publish": True,
                }
            ],
            "publish": True,
        }
    else:
        the_access_note = {
            "jsonmodel_type": "note_multipart",
            "label": "Restrictions on Access",
            "type": "accessrestrict",
            "rights_restriction": {"local_access_restriction_type": []},
            "subnotes": [
                {
                    "jsonmodel_type": "note_text",
                    "content": "This collection is located off-site. You will need to request this material at least three business days in advance to use the collection in the Rare Book and Manuscript Library reading room.",
                    "publish": True,
                }
            ],
            "publish": True,
        }

    for bib in the_bibids:

        try:
            repo, asid = asf.lookupByBibID(bib, lookup_csv)
        except:
            print("Error: No record found for " + str(bib) + ". Skipping...")
            continue

        out_path_old = output_folder + "/" + str(repo) + "_" + str(asid) + "_old.json"
        out_path_new = output_folder + "/" + str(repo) + "_" + str(asid) + "_new.json"

        the_resource = asf.getResource(repo, asid)

        # Save copy of existing object
        print("Saving data to " + out_path_old + "....")
        with open(out_path_old, "w+") as f:
            f.write(the_resource)

        the_data = json.loads(the_resource)

        # Test if there is already an access restriction note.
        has_note = False
        for a_note in the_data["notes"]:
            try:
                if a_note["type"] == "accessrestrict":
                    has_note = True
            except KeyError:
                print("Note has no type -- skipping.")

        if has_note == True:
            print(str(bib) + " - Warning: Already has access note.")
        # else:

        the_data["notes"].append(the_access_note)

        the_new_resource = json.dumps(the_data)

        # Save copy of new object
        print("Saving data to " + out_path_new + "....")
        with open(out_path_new, "w+") as f:
            f.write(the_new_resource)

        try:
            post = asf.postResource(repo, asid, the_new_resource)
            print(post)
        except:
            print("Error: There was a problem posting resource " + str(repo) +
                  ":" + str(asid) + "!")

    quit()

logging.basicConfig(level=logging.ERROR)  # not doing anything with this yet...
# logging.debug('¥¥¥¥¥¥ This is a debug message')
# logging.info('¥¥¥¥¥¥ This is an info message')
# logging.warning('¥¥¥¥¥¥ This is a warning message')
# logging.error('¥¥¥¥¥¥ This is an error message')
# logging.critical('¥¥¥¥¥¥ This is a critical message')

my_name = __file__

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

asf.setServer('Prod')

print('THIS IS A TEST -- IGNORE!')
print(' ')
print('testing google sheet api...')

# The ID and range of a sample spreadsheet.
the_sheet = dataSheet(
    '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ', 'Sheet1!A:Z')
# the_sheet = dataSheet('1YzM1oTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ','Sheet1!A:Z')

print(the_sheet.getData())

# Script to get barcode and holding info from spreadsheet
# and add to top containers in ArchivesSpace via API. See ACFA-206.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
import dcps_utils as util
import os.path
import csv
import datetime

asf.setServer('Prod')

my_name = __file__

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

# sheet_id = '1gUx1cPS8POLxqRblYIs1vlpr7yDGOyHmAJqpl6nMo4k'
sheet_id = '1e43qKYvqGQFOMxA70U59yPKPs18y-k3ohRNdU-qrTH0'  # test

# list_sheet = dataSheet(sheet_id, 'report!A:Z')
list_sheet = dataSheet(sheet_id, 'test!A:Z')  # test

the_data = list_sheet.getData()
the_heads = the_data.pop(0)

today = datetime.date.today().strftime("%Y-%m-%d")

def main():

    # set to True to use test sheet and test json folder location.
    debug = False

    asf.setServer("Prod")

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ""  # set later
    today_str = str((date.today()).strftime("%Y%m%d"))

    if debug:
        print("[Running script in debug mode...]")
        parent_folder = "/cul/cul0/ldpd/archivesspace/test/resources"  # test folder
        sheet_id = "1wFyLN_Ea7ExCZSMuksB8MTrS9DjsUkwsmaPBujL7x0U"  # test sheet
        the_repos = [4]  # to test
    else:
        parent_folder = "/cul/cul0/ldpd/archivesspace/resources"
        sheet_id = "1T3EpIZmnh3Gk-VAIGtvavTQUIpS7AluyKQ8-sJsS8vg"
        the_repos = [2, 3, 4, 5, 6]

    output_folder = parent_folder + "/" + today_str

    the_sheets = {
        "resources": dataSheet(sheet_id, "Resources!A:Z"),
        "cm": dataSheet(sheet_id, "Collection Management!A:Z"),
        "log": dataSheet(sheet_id, "log!A:Z"),
    }

    # Set number of chars to truncate the scope and bioghist notes.
    trunc_len = 400

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["bibid", "/id_0"],
        ["title", "/title"],
        ["published", "/publish"],
        ["create_time", "/create_time"],
        ["system_mtime", "/system_mtime"],
        ["created_by", "/created_by"],
        ["last_modified_by", "/last_modified_by"],
        ["ead_location", "/ead_location"],
        ["ext_number", "/extents/0/number"],
        ["ext_portion", "/extents/0/portion"],
        ["ext_type", "/extents/0/extent_type"],
        # ["integer_1", "/user_defined/integer_1"],
        # ["integer_2", "/user_defined/integer_2"],
        # ["integer_3", "/user_defined/integer_3"],
        ["local call no.", "/user_defined/string_1"],
        ["other ctrl no. 1", "/user_defined/string_2"],
        ["other ctrl no. 2", "/user_defined/string_3"],
        ["other ctrl no. 3", "/user_defined/string_4"],
        # ["enum_1", "/user_defined/enum_1"],
        # ["enum_2", "/user_defined/enum_2"],
        ["description status", "/user_defined/enum_3"],
        ["collecting area", "/user_defined/enum_4"],
        ["level", "level"]
        # (Scope and bioghist notes are added in separately below.)
    ]

    # Get the collection management records for use in report.

    the_cms = []

    fields = [
        "id",
        "parent_id",
        "title",
        "system_mtime",
        "processing_priority",
        "processing_status",
    ]

    print(" ")
    print("*** Retrieve Collection Management Data ***")
    print(" ")

    for r in the_repos:
        print("Getting collection management records for repo: " + str(r) +
              "...")
        cm = asf.getCollectionManagements(r, filter="resource", fields=fields)
        for c in cm:
            row = [c[f] for f in fields]
            the_cms.append(row)

    # a data set of collection management records to post to sheet below.
    the_cms.insert(0, fields)

    print(" ")
    print("*** Retrieve Resource Data ***")
    print(" ")

    # Get the list of resources for each repo and add to the_ids
    the_ids = []
    for r in the_repos:
        print("Getting ids for repo: " + str(r) + "...")
        asids = json.loads(
            asf.getResponse("/repositories/" + str(r) +
                            "/resources?all_ids=true"))
        print(str(len(asids)) + " records found in repo " + str(r) + ".")
        for i in asids:
            the_ids.append([r, i])

    # Construct the head row
    the_heads = [x[0] for x in the_fields]
    the_heads.insert(0, "asid")
    the_heads.insert(0, "repo")
    the_heads.append("scope note")
    the_heads.append("scopenote length")
    the_heads.append("bioghist note")
    the_heads.append("biognote length")

    the_output = [the_heads]

    # Fetch the resources from the ids
    print("Downloading resources...")

    if not os.path.exists(output_folder):
        print("Creating directory " + output_folder + "...")
        os.makedirs(output_folder)

    for repo, asid in the_ids:
        # print("Processsing " + str(repo) + ":" + str(asid) + "...")
        the_row = [repo, asid]
        res_json = asf.getResource(repo, asid)
        res_dict = json.loads(res_json)

        out_path = output_folder + "/" + str(repo) + "_" + str(asid) + ".json"

        # Write the JSON to file.
        with open(out_path, "w+") as f:
            f.write(res_json)

        # Use dpath to extract values from dict and compose into rows.
        for af in the_fields:
            try:
                d = str(dpath.util.get(res_dict, af[1]))
            except:
                d = ""
            the_row.append(d)

        # Process scope and bioghist notes

        the_notes = dpath.util.values(res_dict, "notes/*", afilter=None)

        the_scope_notes = []
        the_biog_notes = []

        for a_note in the_notes:
            try:
                if a_note["type"] == "scopecontent":
                    the_scope_notes.append(a_note)
            except:
                pass
            try:
                if a_note["type"] == "bioghist":
                    the_biog_notes.append(a_note)
            except:
                pass

        if the_scope_notes:
            # If there are scope notes, grab all the text and concatenate.
            # Then get the total length in chars.
            scope_note_texts = [
                s["subnotes"][0]["content"] for s in the_scope_notes
            ]
            the_scope_text = " ".join(scope_note_texts)
            scope_note_len = len(the_scope_text)
            scope_note_short = truncate_str(the_scope_text, length=trunc_len)
        else:
            scope_note_short = ""
            scope_note_len = 0

        if the_biog_notes:
            # If there are bioghist notes, grab all the text and concatenate.
            # Then get the total length in chars.
            biog_note_texts = [
                s["subnotes"][0]["content"] for s in the_biog_notes
            ]
            the_biog_text = " ".join(biog_note_texts)
            biog_note_len = len(the_biog_text)
            biog_note_short = truncate_str(the_biog_text, length=trunc_len)
        else:
            biog_note_short = ""
            biog_note_len = 0

        the_row.append(scope_note_short)
        the_row.append(str(scope_note_len))
        the_row.append(biog_note_short)
        the_row.append(str(biog_note_len))

        the_output.append(the_row)

    # Zip up the JSON files for storage.
    zip_out = make_archive(today_str,
                           "zip",
                           root_dir=parent_folder,
                           base_dir=today_str)

    print(zip_out)

    # Zip is saved in working dir; move to correct location.
    print("Saving zip file " + str(today_str) + ".zip to " + parent_folder)

    # Test if file already exists.
    if os.path.exists(parent_folder + "/" + str(today_str) + ".zip"):
        print("File " + parent_folder + "/" + str(today_str) +
              ".zip exists already. Replacing with new zip file...")
        os.remove(parent_folder + "/" + str(today_str) + ".zip")

    move(zip_out, parent_folder)

    # Remove the json folder once zip is in place.
    rmtree(parent_folder + "/" + today_str)

    util.file_cleanup(parent_folder, 60)

    # Write output to Google sheet.
print(" ") print("*** Writing Data to Report ***") print(" ") the_sheets["cm"].clear() the_sheets["cm"].appendData(the_cms) digester.post_digest( script_name, "Total collection management records: " + str(len(the_cms) - 1)) the_sheets["resources"].clear() the_sheets["resources"].appendData(the_output) digester.post_digest( script_name, "Total number of resource records: " + str(len(the_output) - 1)) ######################## ### FINISH UP ### ######################## # Generate log string. now2 = datetime.now() end_time = str(now2) my_duration = str(now2 - now1) the_log = ("Data imported by " + my_name + ". Start: " + start_time + ". Finished: " + end_time + " (duration: " + my_duration + ").") the_sheets["log"].appendData([[the_log]]) print(" ") print(the_log) print(" ") exit_msg = "Script done. Updated data is available at " + \ the_sheets["resources"].url print(exit_msg) digester.post_digest(script_name, exit_msg)
def main():

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ''  # set later

    # day_offset = now1.weekday() + 1  # Calculate the Sunday of current week
    day_offset = 7  # use past seven days, regardless of current day

    print('Script ' + MY_NAME + ' begun at ' + start_time + '. ')

    if not DEBUG:
        the_sheet_id = '1JA5bRSnYV80sx4m5SOFQ6QJ4u21SXvQeNdNbuRVCdds'
    else:
        the_sheet_id = '1e_TAK8eUsaHltBu9J5bNO1twThqt7_nE5olmz2pdCUw'  # test doc
        day_offset = 14  # use past 2 weeks for testing

    # Set date stamp of start of week (Sunday) to determine recently created accessions.
    begin_of_week = (now1 - datetime.timedelta(day_offset)).date()

    the_sheet_rbml = dataSheet(the_sheet_id, 'rbml!A:Z')
    the_sheet_avery = dataSheet(the_sheet_id, 'avery!A:Z')
    the_sheet_rbmlbooks = dataSheet(the_sheet_id, 'rbmlbooks!A:Z')

    # Location to save output
    if DEBUG is True:
        out_folder = "/cul/cul0/ldpd/archivesspace/test/accessions"
    else:
        out_folder = "/cul/cul0/ldpd/archivesspace/accessions"

    rbml_acc_file = os.path.join(out_folder, 'report_rbml_accessions.json')
    avery_acc_file = os.path.join(out_folder, 'report_avery_accessions.json')
    rbmlbooks_acc_file = os.path.join(out_folder,
                                      'report_rbmlbooks_accessions.json')

    print(' ')

    print('Starting accession report in ' +
          'https://docs.google.com/spreadsheets/d/' + str(the_sheet_id) +
          '/edit?usp=sharing')

    if not DEBUG:
        # Save the accessions as json files. In DEBUG mode, just use the files already saved.

        print('Saving Avery accession data to ' + avery_acc_file + '....')

        # Only fetch file if not in Debug mode
        with open(avery_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(3)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting Avery accession data!")
            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

        print('Saving RBML accession data to ' + rbml_acc_file + '....')

        with open(rbml_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(2)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting RBML accession data!")
            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

        print('Saving RBMLBOOKS accession data to ' + rbmlbooks_acc_file +
              '....')

        with open(rbmlbooks_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(6)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting RBMLBOOKS accession data!")
            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

    print(' ')

    # the_files = [
    #     [avery_acc_file, the_sheet_avery],
    #     [rbml_acc_file, the_sheet_rbml]
    # ]

    the_recents = {}

    the_info = [{
        'repo_name': 'Avery',
        'repo_id': 3,
        'acc_file': avery_acc_file,
        'the_sheet': the_sheet_avery
    }, {
        'repo_name': 'RBML',
        'repo_id': 2,
        'acc_file': rbml_acc_file,
        'the_sheet': the_sheet_rbml
    }, {
        'repo_name': 'RBMLBOOKS',
        'repo_id': 6,
        'acc_file': rbmlbooks_acc_file,
        'the_sheet': the_sheet_rbmlbooks
    }]

    # The top-level elements to save from the JSON (each can be further processed below)
    the_keys = {
        "title": "title",
        "uri": "uri",
        "repository": "repository",
        "accession_date": "accession_date",
        "id_0": "id_0",
        "id_1": "id_1",
        "id_2": "id_2",
        "id_3": "id_3",
        "extents": "extents",
        "related_resources": "related_resources",
        "collection_management": "collection_management",
        "user_defined": "user_defined",
        "create_time": "create_time",
        "system_mtime": "system_mtime",
        "last_modified_by": "last_modified_by"
    }

    ext_dict = {
        "ext-number": "number",
        "ext-portion": "portion",
        "ext-type": "extent_type"
    }

    for f in the_info:
        the_file = f['acc_file']
        the_target = f['the_sheet']
        repo_name = f['repo_name']

        with open(the_file) as f:
            the_data = json.load(f)

        all_rows = []

        for an_accession in the_data:
            # acc_info : prelim dict for each accession. Do things to it.
            acc_info = {}
            for key, value in the_keys.items():
                try:
                    acc_info.update({key: an_accession[value]})
                except (IndexError, KeyError):
                    acc_info.update({key: ""})

            # Refine elements by extracting subelements, etc.

            # Handle collection_management
            cm = acc_info["collection_management"]
            cm_dict = {
                "processing_priority": "processing_priority",
                "processing_status": "processing_status"
            }
            for key, value in cm_dict.items():
                try:
                    acc_info[key] = cm[value]
                except (IndexError, KeyError, TypeError):
                    acc_info[key] = ''
            acc_info.pop("collection_management")

            # Parse resource id and get bibid
            res = acc_info["related_resources"]
            if len(res) > 0:
                res_url = res[0]["ref"]
                repo = res_url.split('/')[2]
                asid = res_url.split('/')[4]
                bibid = asf.lookupBibID(repo, asid, LOOKUP_CSV)
            else:
                bibid = ''
                asid = ''
            acc_info["resource_bibid"] = bibid
            acc_info["resource_asid"] = asid
            acc_info.pop("related_resources")

            # Parse BibID out of user_defined / integer_1
            try:
                usdef = acc_info["user_defined"]
                acc_info['integer_1'] = usdef['integer_1']
            except:
                acc_info['integer_1'] = ''
            acc_info.pop("user_defined")

            # Fix problem with leading "+" in id_3 (add apostrophe for display)
            acc_info["id_3"] = re.sub(r"^\+", "'+", acc_info["id_3"])

            # Handle repository
            repository = acc_info["repository"]
            if len(repository) > 0:
                repo_url = repository["ref"]
                repo = repo_url.split('/')[2]
            else:
                repo = ''
            acc_info["repo"] = repo
            acc_info.pop("repository")

            # Handle date
            acc_date = acc_info["accession_date"]
            yyyy = int(acc_date.split('-')[0])
            mm = int(acc_date.split('-')[1])
            dd = int(acc_date.split('-')[2])
            the_date = datetime.date(yyyy, mm, dd)
            # Due to legacy import issue, some with unknown dates have
            # malformed dates like 0002-01-23. Acknowledge their unknownness.
            if the_date.year < 1700:
                acc_info["accession_date"] = "0000-00-00"
                acc_info["year"] = ""
            else:
                acc_info["year"] = the_date.year

            # Fiscal year
            if the_date.year < 1700:
                acc_info["fiscal-year"] = ""
            else:
                if the_date.month > 6:
                    acc_info["fiscal-year"] = the_date.year + 1
                else:
                    acc_info["fiscal-year"] = the_date.year

            # Handle extents
            ext = acc_info["extents"]
            for key, value in ext_dict.items():
                try:
                    acc_info[key] = ext[0][value]
                except (IndexError, KeyError):
                    acc_info[key] = ''
            acc_info.pop("extents")

            # Clean up titles
            acc_info['title'] = str(acc_info['title']).strip()

            # Uncomment to list records in log.
            # print("processing: " + str(acc_info["uri"]).strip() + ' / ' +
            #       str(acc_info["title"]).strip())

            all_rows.append(acc_info)

        processed_msg = 'Processed ' + \
            str(len(all_rows)) + ' records in ' + repo_name + '.'
        print(processed_msg)
        log_it(SCRIPT_NAME, processed_msg)

        # the_heads = list(all_rows[0].keys())
        # explicitly order the columns, as dict order is unpredictable.
        the_heads = [
            'title', 'uri', 'accession_date', 'id_0', 'id_1', 'id_2', 'id_3',
            'integer_1', 'resource_bibid', 'resource_asid', 'repo', 'year',
            'fiscal-year', 'ext-number', 'ext-portion', 'ext-type',
            'processing_priority', 'processing_status', 'create_time',
            'system_mtime', 'last_modified_by'
        ]

        the_output = []

        # Build row in order specified by the_heads
        for a_row in all_rows:
            # r = list(a_row.values())
            r = [a_row[h] for h in the_heads]
            the_output.append(r)
            # print(a_row)

        # sort by accession_date (the 2nd item in inner lists)
        the_output = sorted(the_output, key=itemgetter(2), reverse=True)

        # Get list of recents
        the_recents[repo_name] = []

        for i in the_output:
            # i[18] = the create date column
            i_date = dateutil.parser.isoparse(i[18]).date()
            if i_date > begin_of_week:
                the_recents[repo_name].append(i)

        # If there are recents, list them
        if the_recents[repo_name]:
            print(' ')
            recent_msg = str(len(the_recents[repo_name])) + \
                ' accessions recently added in ' + repo_name + ': '
            print(recent_msg)
            log_it(SCRIPT_NAME, recent_msg)
            print('-----------')
            for r in the_recents[repo_name]:
                print(r[0])
                print(r[1])
                print('Created ' +
                      str(dateutil.parser.isoparse(r[18]).date()))
                print('Last edited by ' + r[20])
                print('-----------')
        else:
            print(' ')
            recent_msg = 'No recently created accessions in ' + repo_name
            print(recent_msg)
            log_it(SCRIPT_NAME, recent_msg)

        # print(the_recents[repo_name])

        the_output.insert(0, the_heads)

        print(' ')

        the_target.clear()

        print('Writing ' + repo_name + ' data to sheet ...')
        the_target.appendData(the_output)

    print(' ')

    # generate log and add to log tab, if exists.
    the_tabs = the_target.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    if DEBUG is True:
        the_log = '[TEST] Data imported from ' + target_server + ' by ' + MY_NAME + '. Start: ' + \
            start_time + '. Finished: ' + end_time + \
            ' (duration: ' + my_duration + ').'
    else:
        the_log = 'Data imported from ' + target_server + ' by ' + MY_NAME + '. Start: ' + \
            start_time + '. Finished: ' + end_time + \
            ' (duration: ' + my_duration + ').'

    if 'log' in the_tabs:
        log_range = 'log!A:A'
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(the_sheet_id, log_range).appendData([[the_log]])
    else:
        print('*** Warning: There is no log tab in this sheet. ***')

    print(' ')
    print(the_log)
    log_it(SCRIPT_NAME, the_log)

    print(' ')

    exit_msg = 'Script done. Updated data is available at ' + \
        'https://docs.google.com/spreadsheets/d/' + \
        str(the_sheet_id) + '/edit?usp=sharing'
    print(exit_msg)
    log_it(SCRIPT_NAME, exit_msg)

def main():

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    sheet_id = '13OakaS0KHtxcaV9HGWDP9Zfnz9TVJR_9zGUnKrb90jk'  # test
    # sheet_id = '1tYOXSDFlkbX_revB_ULvhmCdvKkyzpipBTkYqYXcM38'
    # sheet_id = '1e43qKYvqGQFOMxA70U59yPKPs18y-k3ohRNdU-qrTH0'  # test
    # sheet_id = '1OhgJ4g-SWbmnms4b3ppe_0rBT7hz9jfQp6P8mADcatk'  # batch template doc

    container_sheet = dataSheet(sheet_id, 'containers!A:Z')
    marc_sheet = dataSheet(sheet_id, 'marc!A:Z')

    # Get a list of bibids from the Marc tab.
    # the_bibids = marc_sheet.getDataColumns()[0]
    the_bibids = marc_sheet.getDataColumns()[1]
    the_bibids.pop(0)
    the_bibids = list(set(the_bibids))

    print(the_bibids)

    #### TOP CONTAINERS ####

    the_heads = [
        'bibid', 'resource', 'uri', 'type', 'display_string', 'concat'
    ]
    the_rows = [the_heads]

    lookup_csv = os.path.join(my_path, 'id_lookup_prod.csv')

    for abib in the_bibids:
        print(abib)

        # Get repo and asid from bibid
        repo, asid = asf.lookupByBibID(abib, lookup_csv)
        print('Getting top containers for ' + str(repo) + ':' + str(asid))

        the_query = '/repositories/' + \
            str(repo) + '/resources/' + str(asid) + '/top_containers'

        # list of top containers
        the_refs = json.loads(asf.getResponse(the_query))
        print(the_refs)

        cnt = 0
        for r in the_refs:
            cnt += 1
            print(cnt)
            try:
                tc = json.loads(asf.getResponse(r['ref']))
                # print(tc)
                try:
                    bibid = tc['collection'][0]['identifier']
                except:
                    bibid = ''
                try:
                    resource = tc['collection'][0]['ref']
                except:
                    resource = ''
                try:
                    uri = tc['uri']
                except:
                    uri = ''
                try:
                    type = tc['type']
                except:
                    type = ''
                try:
                    display_string = tc['display_string']
                except:
                    display_string = ''
                try:
                    concat_str = str(tc['display_string'] + ' (' +
                                     uri.split('/')[4]) + ')'
                except:
                    concat_str = 'x'

                a_row = [
                    bibid, resource, uri, type, display_string, concat_str
                ]
                # print(a_row)
                the_rows.append(a_row)
            except:
                print(r)

    # Write results to google sheet
    container_sheet.clear()
    z = container_sheet.appendData(the_rows)
    print(z)

tree = et.parse(id_xml)
root = tree.getroot()

the_recs = root.findall('record')

the_ids = []

for a_rec in the_recs:
    i = a_rec.xpath('identifier/text()')
    # get the asid from the uri string.
    asid = str(i[0].split('/')[-1]).rstrip()
    # get the repo from the uri string.
    repo = str(i[0].split('/')[-3]).rstrip()
    bibid = a_rec.xpath('bibid/text()')[0]
    the_ids.append([repo, asid, bibid])

for x in the_ids:
    the_ead = asf.getEAD(x[0], x[1])
    out_path = output_folder + '/' + str(x[2]) + '_out.xml'

    # Save copy of existing object
    print('Saving data to ' + out_path + '....')
    f = open(out_path, "w+")
    f.write(the_ead)
    f.close()

my_name = __file__
script_name = os.path.basename(my_name)

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

sheet_id = "1dTeMAK_cGWAUvrqvAiY2hGy4gJewrmWjnuIZu8NhWwE"
# sheet_id = "18uvn9wIABHVIdjlSRNXqnHUKB2aTvZgKO62e-UFNuO8"  # test

now1 = datetime.datetime.now()
start_time = str(now1)
end_time = ""  # set later

# First get the agent records from API (this can take a long time!)

asf.setServer("Prod")  # AS instance: Prod | Dev | Test

the_info = [
    {
        "name": "families",
        "endpoint": "/agents/families",
    },
    {
        "name": "corporate",
        "endpoint": "/agents/corporate_entities",
    },
    {
        "name": "persons",
        "endpoint": "/agents/people",
    },
]
