def main():
    my_name = __file__
    script_name = os.path.basename(my_name)

    saxon_path = '../../resources/saxon-9.8.0.12-he.jar'
    in_path = '~/ohac_marc.xml'
    xsl_path = './oral2solr.xsl'
    out_path = '~/ohac_marc_OUT.xml'

    digester.post_digest(script_name, "THIS IS A TEST")

    print(acfa.run_saxon(saxon_path, in_path, xsl_path, out_path))
def ohac_extract():
    # extract_script_path = '/cul/cul0/ldpd/ccoh/fetchOralHistoryRecords'
    extract_script_path = os.path.join(my_path, './fetchOralHistoryRecords')

    # marc_output_path = '/cul/cul0/ldpd/archival_data/test/marc/ohac_marc.xml'  # Test
    marc_output_path = '/cul/cul0/ldpd/archival_data/marc/oral_history_portal/ohac_marc.xml'
    # marc_output_clean_path = '/cul/cul0/ldpd/archival_data/test/marc/ohac_marc_clean.xml'  # Test
    marc_output_clean_path = '/cul/cul0/ldpd/archival_data/marc/oral_history_portal/ohac_marc_clean.xml'
    # solr_output_path = '/cul/cul0/ldpd/archival_data/test/solr/ohac_solr.xml'  # Test
    solr_output_path = '/cul/cul0/ldpd/archival_data/solr/ohac_solr.xml'

    # saxon_path = os.environ['HOME'] + '/lib/saxon-9he.jar'
    saxon_path = '/opt/dcps/resources/saxon-9.8.0.12-he.jar'
    xslt_path = os.path.join(my_path, 'oral2solr.xsl')

    # Remove existing file so fetchOralHistoryRecords won't fail.
    if os.path.exists(marc_output_path):
        print("Removing old file at " + marc_output_path)
        os.remove(marc_output_path)

    the_shell_command = extract_script_path + ' --output ' + marc_output_path

    print('Extracting OHAC MARC data from Voyager...')
    res = acfa.run_bash(the_shell_command)
    # print(res)
    if reporting:
        digester.post_digest(script_name, res)  # reporting

    # Do regex to remove some illegal characters. See ACFA-270.
    res = acfa.sanitize_xml(marc_output_path, marc_output_clean_path)
    if res:
        print(res)
        if reporting:
            digester.post_digest(script_name, res)  # reporting

    print('Transforming MARC to SOLR XML...')
    response = acfa.run_saxon(saxon_path, marc_output_clean_path,
                              xslt_path, solr_output_path)
    print(response)
    if "ERROR" in response:
        return []
    else:
        return [solr_output_path]
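
# For context, a minimal sketch of the kind of character filtering that
# acfa.sanitize_xml appears to perform (per the ACFA-270 comment above).
# The function body and regex here are assumptions for illustration only;
# the real helper in the acfa module may behave differently.
import re

# Control characters that are illegal in XML 1.0 (assumed target of ACFA-270).
_ILLEGAL_XML_CHARS = re.compile('[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')


def sanitize_xml_sketch(in_path, out_path):
    """Hypothetical stand-in for acfa.sanitize_xml: copy in_path to out_path,
    dropping characters that are illegal in XML 1.0. Returns a message if
    anything was removed, else an empty string (mirroring how callers test
    the return value)."""
    with open(in_path, 'r', encoding='utf-8', errors='replace') as f:
        text = f.read()
    cleaned, n = _ILLEGAL_XML_CHARS.subn('', text)
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(cleaned)
    return 'Removed ' + str(n) + ' illegal character(s).' if n else ''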
def main():
    solr_index_envs = []
    if len(sys.argv) > 1:
        solr_index_envs = sys.argv[1].split(',')
    else:
        # Exit because there was no argument dev|test|prod.
        sys.exit("Error: No solr_index_env argument(s) provided!")

    # Only turn on digest reporting if running on Prod.
    global reporting
    if 'prod' in solr_index_envs:
        print("Reporting == True!")
        reporting = True

    solr_update_urls = [
        "http://ldpd-solr-" + solr_index_env +
        "1.cul.columbia.edu:8983/solr/archives_portal/update"
        for solr_index_env in solr_index_envs
    ]

    for solr_xml_path in archival_collections_extract():
        for solr_update_url in solr_update_urls:
            acfa.run_post(solr_xml_path, solr_update_url)

    for solr_xml_path in ohac_extract():
        for solr_update_url in solr_update_urls:
            acfa.run_post(solr_xml_path, solr_update_url)

    my_path = os.path.dirname(__file__)
    commit_xml_path = os.path.join(my_path, 'commit.xml')
    delete_xml_path = os.path.join(my_path, 'delete-delta.xml')

    for solr_update_url in solr_update_urls:
        # Commit the document adds/updates.
        acfa.run_post(commit_xml_path, solr_update_url)
        # Delete everything that wasn't added/updated in this job.
        acfa.run_post(delete_xml_path, solr_update_url)
        # Commit the deletes.
        acfa.run_post(commit_xml_path, solr_update_url)

    if reporting:
        digester.post_digest(
            script_name, script_name + ' completed at ' +
            str(datetime.datetime.now().strftime('%m/%d/%Y %H:%M:%S')) + '.')
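
# For reference, a minimal sketch of the HTTP call that acfa.run_post
# presumably wraps: POSTing a Solr XML update document (adds, <commit/>, or
# a delete query) to the update handler. The requests-based body is an
# assumption; only the call sites above come from the source.
import requests


def run_post_sketch(xml_path, solr_update_url):
    """Hypothetical stand-in for acfa.run_post: send one Solr XML update
    document to the update handler and return the response body."""
    with open(xml_path, 'rb') as f:
        payload = f.read()
    r = requests.post(solr_update_url, data=payload,
                      headers={'Content-Type': 'text/xml'})
    return r.text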
def archival_collections_extract():
    # marc_data_folder = '/cul/cul0/ldpd/archival_data/test/marc/archives_portal'
    # solr_output_folder = '/cul/cul0/ldpd/archival_data/test/solr'
    marc_data_folder = '/cul/cul0/ldpd/archival_data/marc/archives_portal'
    solr_output_folder = '/cul/cul0/ldpd/archival_data/solr'

    # saxon_path = os.environ['HOME'] + '/lib/saxon-9he.jar'
    saxon_path = '/opt/dcps/resources/saxon-9.8.0.12-he.jar'
    xslt_path = os.path.join(my_path, 'marc2solr.xsl')

    the_repos = [
        {'data_file': 'AV.xml', 'clean_file': 'AV_clean.xml', 'repo_id': 'nnc-a'},
        {'data_file': 'EA.xml', 'clean_file': 'EA_clean.xml', 'repo_id': 'nnc-ea'},
        {'data_file': 'HS.xml', 'clean_file': 'HS_clean.xml', 'repo_id': 'nnc-m'},
        {'data_file': 'CCOH.xml', 'clean_file': 'CCOH_clean.xml', 'repo_id': 'nnc-ccoh'},
        {'data_file': 'RB.xml', 'clean_file': 'RB_clean.xml', 'repo_id': 'nnc-rb'},
        {'data_file': 'UA.xml', 'clean_file': 'UA_clean.xml', 'repo_id': 'nnc-ua'},
        {'data_file': 'UT.xml', 'clean_file': 'UT_clean.xml', 'repo_id': 'nnc-ut'},
    ]

    transform_paths = []
    # error_pattern = re.compile("^SAXON ERROR")

    for r in the_repos:
        raw_file_path = marc_data_folder + '/' + r['data_file']
        clean_file_path = marc_data_folder + '/' + r['clean_file']
        repo_id = r['repo_id']
        out_path = solr_output_folder + '/' + repo_id + '_solr' + '.xml'
        the_params = 'repo=' + repo_id

        repo_msg = 'Processing ' + r['data_file'] + '...'
        print(repo_msg)
        if reporting:
            digester.post_digest(script_name, repo_msg)  # reporting

        # Strip out bad characters if any. See ACFA-270.
        res = acfa.sanitize_xml(raw_file_path, clean_file_path)
        if res:
            print(res)
            if reporting:
                digester.post_digest(script_name, res)  # reporting

        # Transform to Solr XML.
        response = acfa.run_saxon(saxon_path, clean_file_path, xslt_path,
                                  out_path, theParams=the_params)
        print(response)
        if reporting:
            digester.post_digest(script_name, response)  # reporting
        if "ERROR" not in response:
            transform_paths.append(out_path)

    return transform_paths
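
# The theParams argument above maps onto Saxon's command-line parameter
# syntax (name=value pairs after the -s/-xsl/-o options). A rough sketch of
# the subprocess call that acfa.run_saxon likely builds, offered as an
# assumption rather than the module's actual implementation:
import subprocess


def run_saxon_sketch(saxon_path, in_path, xsl_path, out_path, theParams=''):
    """Hypothetical stand-in for acfa.run_saxon: invoke Saxon-HE from the
    command line and return its combined output, which callers above grep
    for the string 'ERROR'."""
    cmd = ['java', '-jar', saxon_path,
           '-s:' + in_path, '-xsl:' + xsl_path, '-o:' + out_path]
    if theParams.strip():
        cmd += theParams.split()  # e.g. 'repo=nnc-rb'
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.stdout + result.stderr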
def log_it(script, log):
    if DEBUG is not True:
        digester.post_digest(script, log)
d = "" the_row.append(d) # print(the_row) # Handle subclassifications if 'terms' in s: the_terms = s['terms'] for t in the_terms: the_row.append(t['term'] + ' [' + t['term_type'] + ']') the_output.append(the_row) the_sheet.clear() save = the_sheet.appendData(the_output) print(save) digester.post_digest(script_name, "Total subject records: " + str(len(the_output) - 1)) # Generate log now2 = datetime.datetime.now() end_time = str(now2) my_duration = str(now2 - now1) the_log = (str(subj_cnt) + " subject records imported by " + my_name + ". " + " Start: " + start_time + ". Finished: " + end_time + " (duration: " + my_duration + ").") log_range = "log!A:A" log_sheet = dataSheet(sheet_id, log_range) log_sheet.appendData([[the_log]])
def main():
    # Set to True to use test sheet and test json folder location.
    debug = False

    asf.setServer("Prod")

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ""  # set later
    today_str = str((date.today()).strftime("%Y%m%d"))

    if debug:
        print("[Running script in debug mode...]")
        parent_folder = "/cul/cul0/ldpd/archivesspace/test/resources"  # test folder
        sheet_id = "1wFyLN_Ea7ExCZSMuksB8MTrS9DjsUkwsmaPBujL7x0U"  # test sheet
        the_repos = [4]  # to test
    else:
        parent_folder = "/cul/cul0/ldpd/archivesspace/resources"
        sheet_id = "1T3EpIZmnh3Gk-VAIGtvavTQUIpS7AluyKQ8-sJsS8vg"
        the_repos = [2, 3, 4, 5, 6]

    output_folder = parent_folder + "/" + today_str

    the_sheets = {
        "resources": dataSheet(sheet_id, "Resources!A:Z"),
        "cm": dataSheet(sheet_id, "Collection Management!A:Z"),
        "log": dataSheet(sheet_id, "log!A:Z"),
    }

    # Set number of chars to truncate the scope and bioghist notes.
    trunc_len = 400

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["bibid", "/id_0"],
        ["title", "/title"],
        ["published", "/publish"],
        ["create_time", "/create_time"],
        ["system_mtime", "/system_mtime"],
        ["created_by", "/created_by"],
        ["last_modified_by", "/last_modified_by"],
        ["ead_location", "/ead_location"],
        ["ext_number", "/extents/0/number"],
        ["ext_portion", "/extents/0/portion"],
        ["ext_type", "/extents/0/extent_type"],
        # ["integer_1", "/user_defined/integer_1"],
        # ["integer_2", "/user_defined/integer_2"],
        # ["integer_3", "/user_defined/integer_3"],
        ["local call no.", "/user_defined/string_1"],
        ["other ctrl no. 1", "/user_defined/string_2"],
        ["other ctrl no. 2", "/user_defined/string_3"],
        ["other ctrl no. 3", "/user_defined/string_4"],
        # ["enum_1", "/user_defined/enum_1"],
        # ["enum_2", "/user_defined/enum_2"],
        ["description status", "/user_defined/enum_3"],
        ["collecting area", "/user_defined/enum_4"],
        ["level", "level"]
        # (Scope and bioghist notes are added in separately below.)
    ]

    # Get the collection management records for use in report.
    the_cms = []

    fields = [
        "id",
        "parent_id",
        "title",
        "system_mtime",
        "processing_priority",
        "processing_status",
    ]

    print(" ")
    print("*** Retrieve Collection Management Data ***")
    print(" ")

    for r in the_repos:
        print("Getting collection management records for repo: " + str(r) + "...")
        cm = asf.getCollectionManagements(r, filter="resource", fields=fields)
        for c in cm:
            row = [c[f] for f in fields]
            the_cms.append(row)

    # A data set of collection management records to post to sheet below.
    the_cms.insert(0, fields)

    print(" ")
    print("*** Retrieve Resource Data ***")
    print(" ")

    # Get the list of resources for each repo and add to the_ids
    the_ids = []
    for r in the_repos:
        print("Getting ids for repo: " + str(r) + "...")
        asids = json.loads(
            asf.getResponse("/repositories/" + str(r) + "/resources?all_ids=true"))
        print(str(len(asids)) + " records found in repo " + str(r) + ".")
        for i in asids:
            the_ids.append([r, i])

    # Construct the head row
    the_heads = [x[0] for x in the_fields]
    the_heads.insert(0, "asid")
    the_heads.insert(0, "repo")
    the_heads.append("scope note")
    the_heads.append("scopenote length")
    the_heads.append("bioghist note")
    the_heads.append("biognote length")

    the_output = [the_heads]

    # Fetch the resources from the ids
    print("Downloading resources...")

    if not os.path.exists(output_folder):
        print("Creating directory " + output_folder + "...")
        os.makedirs(output_folder)

    for repo, asid in the_ids:
        # print("Processing " + str(repo) + ":" + str(asid) + "...")
        the_row = [repo, asid]
        res_json = asf.getResource(repo, asid)
        res_dict = json.loads(res_json)

        out_path = output_folder + "/" + str(repo) + "_" + str(asid) + ".json"

        # Write the JSON to file.
        with open(out_path, "w+") as f:
            f.write(res_json)

        # Use dpath to extract values from dict and compose into rows.
        for af in the_fields:
            try:
                d = str(dpath.util.get(res_dict, af[1]))
            except:
                d = ""
            the_row.append(d)

        # Process scope and bioghist notes
        the_notes = dpath.util.values(res_dict, "notes/*", afilter=None)

        the_scope_notes = []
        the_biog_notes = []
        for a_note in the_notes:
            try:
                if a_note["type"] == "scopecontent":
                    the_scope_notes.append(a_note)
            except:
                pass
            try:
                if a_note["type"] == "bioghist":
                    the_biog_notes.append(a_note)
            except:
                pass

        if the_scope_notes:
            # If there are scope notes, grab all the text and concatenate.
            # Then get the total length in chars.
            scope_note_texts = [
                s["subnotes"][0]["content"] for s in the_scope_notes
            ]
            the_scope_text = " ".join(scope_note_texts)
            scope_note_len = len(the_scope_text)
            scope_note_short = truncate_str(the_scope_text, length=trunc_len)
        else:
            scope_note_short = ""
            scope_note_len = 0

        if the_biog_notes:
            # If there are bioghist notes, grab all the text and concatenate.
            # Then get the total length in chars.
            biog_note_texts = [
                s["subnotes"][0]["content"] for s in the_biog_notes
            ]
            the_biog_text = " ".join(biog_note_texts)
            biog_note_len = len(the_biog_text)
            biog_note_short = truncate_str(the_biog_text, length=trunc_len)
        else:
            biog_note_short = ""
            biog_note_len = 0

        the_row.append(scope_note_short)
        the_row.append(str(scope_note_len))
        the_row.append(biog_note_short)
        the_row.append(str(biog_note_len))

        the_output.append(the_row)

    # Zip up the JSON files for storage.
    zip_out = make_archive(today_str, "zip",
                           root_dir=parent_folder, base_dir=today_str)
    print(zip_out)

    # Zip is saved in working dir; move to correct location.
    print("Saving zip file " + str(today_str) + ".zip to " + parent_folder)

    # Test if file already exists.
    if os.path.exists(parent_folder + "/" + str(today_str) + ".zip"):
        print("File " + parent_folder + "/" + str(today_str) +
              ".zip exists already. Replacing with new zip file...")
        os.remove(parent_folder + "/" + str(today_str) + ".zip")

    move(zip_out, parent_folder)

    # Remove the json folder once zip is in place.
    rmtree(parent_folder + "/" + today_str)

    util.file_cleanup(parent_folder, 60)

    # Write output to Google sheet.
print(" ") print("*** Writing Data to Report ***") print(" ") the_sheets["cm"].clear() the_sheets["cm"].appendData(the_cms) digester.post_digest( script_name, "Total collection management records: " + str(len(the_cms) - 1)) the_sheets["resources"].clear() the_sheets["resources"].appendData(the_output) digester.post_digest( script_name, "Total number of resource records: " + str(len(the_output) - 1)) ######################## ### FINISH UP ### ######################## # Generate log string. now2 = datetime.now() end_time = str(now2) my_duration = str(now2 - now1) the_log = ("Data imported by " + my_name + ". Start: " + start_time + ". Finished: " + end_time + " (duration: " + my_duration + ").") the_sheets["log"].appendData([[the_log]]) print(" ") print(the_log) print(" ") exit_msg = "Script done. Updated data is available at " + \ the_sheets["resources"].url print(exit_msg) digester.post_digest(script_name, exit_msg)
def log_it(msg):
    print(msg)
    digester.post_digest(SCRIPT_NAME, msg)
def main():
    asf.setServer('Prod')  # AS instance: Prod | Dev | Test

    mode = 'Prod'  # Prod or Test

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    ########################
    ### PROCESS OAI DATA ###
    ########################

    # Set path to Saxon processor
    # saxon_path = os.path.join(my_path, "../../resources/saxon-9.8.0.12-he.jar")

    # XSLT file to generate report
    marc_xslt_file = os.path.join(my_path, '../xslt/marcDataExtract.xsl')

    if mode == 'Prod':
        # OAI XML file to use as source
        # source_dir='/cul/cul0/lito/libsys/voyager/prod/data/loads/AS_harvest'
        source_dir = '/cul/cul0/ldpd/archivesspace/oai'
        sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
        oai_file = source_dir + '/' + yest_str + '.asAllRaw.xml'
    else:
        # TEST
        yest_str = "20190915"
        # OAI XML file to use as source
        source_dir = '/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/cached_eads_20190912'  # local test
        sheet_id = '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ'
        oai_file = yest_str + '.asAllRaw.xml'

    the_sheets = {
        'oai': dataSheet(sheet_id, 'oai!A:Z'),
        'oai_last': dataSheet(sheet_id, 'oai_last!A:Z'),
        'log': dataSheet(sheet_id, 'log!A:Z')
    }

    the_outpath = os.path.join(my_path,
                               'output/' + yest_str + '.marc_reporter_out.xml')

    print(' ')

    # Copy oai current data to oai_last sheet for diff
    the_old_data = the_sheets['oai'].getData()
    the_sheets['oai_last'].clear()
    the_sheets['oai_last'].appendData(the_old_data)

    # Process OAI MARC and output to CSV
    util.saxon_process(oai_file, marc_xslt_file, the_outpath)

    # Clear data from "new" sheet
    the_sheets['oai'].clear()

    # Send result csv to Google Sheet.
    y = the_sheets['oai'].importCSV(the_outpath, delim='|')

    print(' ')

    ###########################
    ### PROCESS UNPUBLISHED ###
    ###########################

    print('Finding unpublished records...')

    the_repos = [2, 3, 4, 5]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_heads = [
        'REPO', 'REPO_ID', 'RESOURCE_ID', 'TITLE', 'BIBID', 'CREATE_TIME',
        'SYSTEM_MTIME', 'LAST_MODIFIED_BY'
    ]

    unpubs_sheet = dataSheet(sheet_id, 'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # print(row)
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # Get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # Get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid)
            row.insert(0, repo_id)

            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)

            the_unpublished.append(row)

        print('Repo ' + str(r) + ': ' + str(len(x)))

    # print('Total unpublished: ' + str(len(the_unpublished)))
    msg = 'Total unpublished: ' + str(len(the_unpublished))
    print(msg)
    digester.post_digest(script_name, msg)  # Test

    unpubs_sheet.clear()
    unpubs_sheet.appendData([the_heads])
    unpubs_sheet.appendData(the_unpublished)

    #########################
    ### GET NEWLY CREATED ###
    #########################

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:
        print('processing ' + d['filter'])

        the_delta_sheet = dataSheet(sheet_id, d['range'])
        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]
        the_heads = [
            'repo', 'asid', 'title', 'identifier', 'create_time',
            'system_mtime', 'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:
            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='ctime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                # print(row)
                # Get the repo from the uri string.
                repo = str(row[0].split('/')[-3]).rstrip()
                # Get the asid from the uri string.
                asid = str(row[0].split('/')[-1]).rstrip()
                row.pop(0)
                row.insert(0, asid)
                row.insert(0, repo)
                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))

            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))
        digester.post_digest(
            script_name,
            'New ' + d['filter'] + ': ' + str(len(the_modifieds)))
        # Test
        # the_sheet.clear()
        # the_sheet.appendData([the_fields])
        the_delta_sheet.appendData(the_modifieds)

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = 'Data imported by ' + my_name + '. Start: ' + start_time + \
        '. Finished: ' + end_time + ' (duration: ' + my_duration + ').'

    the_sheets['log'].appendData([[the_log]])

    print(' ')
    print(the_log)
    digester.post_digest(script_name, the_log)  # Test
    print(' ')
    print('Script done. Updated data is available at ' + the_sheets['oai'].url)
"name": "persons", "endpoint": "/agents/people", }, ] for i in the_info: print("Getting agents: " + i["name"]) out_path = os.path.join(my_path, "output/agents_" + i["name"] + ".pickle") # Get a list of agent ids from API agents_list = json.loads(asf.getResponse(i["endpoint"] + "?all_ids=true")) agent_cnt_str = "Number of agents (" + \ i['name'] + "): " + str(len(agents_list)) print(agent_cnt_str) digester.post_digest(script_name, agent_cnt_str) cnt = 0 agent_data = [] # Loop through agent ids and get full record from API. for agent in agents_list: cnt += 1 # print("COUNT: " + str(cnt)) # print("Agent # " + str(agent)) x = asf.getResponse(i["endpoint"] + "/" + str(agent)) agent_data.append(json.loads(x)) # Save data as pickle util.pickle_it(agent_data, out_path)
def main():
    # Set to True to harvest complete set; otherwise will select based on date.
    HARVESTALL = False

    my_name = __file__
    my_path = os.path.dirname(__file__)
    script_name = os.path.basename(my_name)

    # Calculate dates in format yyyymmdd
    today = datetime.date.today().strftime("%Y%m%d")
    yesterday = (datetime.date.today() -
                 datetime.timedelta(days=1)).strftime("%Y%m%d")

    destination_folder = "/cul/cul0/ldpd/archivesspace/oai"
    # destination_folder = "/cul/cul0/ldpd/archivesspace/test"  # test
    # destination_folder = "./"  # test
    xslt_path = os.path.join(my_path, "../xslt/cleanOAI.xsl")

    out_path_raw = os.path.join(destination_folder, today + ".asRaw.xml")
    out_path_raw_all = os.path.join(destination_folder, today + ".asAllRaw.xml")
    out_path_clean = os.path.join(destination_folder, today + ".asClean.xml")

    # Set server to Prod | Test | Dev
    server = "Prod"

    fromDate = yesterday

    # # Not using date, get all records and then filter with the XSLT!
    # date_params = ""

    # Select date interval for harvest
    # TODO: change this to be controlled by param file.
    if HARVESTALL == True:
        date_params = " "  # Use this to harvest all records.
    else:
        date_params = "-f " + yesterday

    # Harvest OAI-PMH data
    print("Harvesting data from OAI...")
    util.oai_harvest(out_path_raw, server=server, date_params=date_params)

    # Process through XSLT
    # TODO: change xsl to not require this param, if we are doing it in the harvest!
    time_offset = 'P800DT30H'
    saxon_params = " time_offset=" + time_offset

    print("Processing file with XSLT...")
    x = util.saxon_process(out_path_raw, xslt_path, out_path_clean,
                           theParams=saxon_params)
    print(x)
    digester.post_digest(script_name, x)

    print("Harvesting all records for reporting ...")
    date_params = " "
    util.oai_harvest(out_path_raw_all, server=server, date_params=date_params)

    # Remove old OAI files
    util.file_cleanup(destination_folder, 30)

    digester.post_digest(
        script_name, script_name + ' completed at ' +
        str(datetime.datetime.now().strftime('%m/%d/%Y %H:%M:%S')) + '.')
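
# util.oai_harvest is a project helper not shown in this excerpt. For
# orientation only, a minimal sketch of the underlying OAI-PMH request it
# presumably wraps. The endpoint URL and metadataPrefix are placeholders, and
# the real helper also handles server selection, the -f date flag, and
# resumption tokens for large result sets.
import requests

OAI_BASE = "https://example.org/oai"  # hypothetical endpoint
params = {
    "verb": "ListRecords",
    "metadataPrefix": "oai_ead",  # assumed prefix
    "from": "2019-09-15",         # OAI-PMH expects YYYY-MM-DD
}
resp = requests.get(OAI_BASE, params=params)
with open("20190916.asRaw.xml", "w", encoding="utf-8") as f:
    f.write(resp.text)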
acc_info.pop("extents") # Clean up titles acc_info['title'] = str(acc_info['title']).strip() # Uncomment to list records in log. # print("processing: " + str(acc_info["uri"]).strip() + ' / ' + str(acc_info["title"]).strip() ) all_rows.append(acc_info) processed_msg = 'Processed ' + \ str(len(all_rows)) + ' records in ' + repo_name + '.' print(processed_msg) digester.post_digest(script_name, processed_msg) # the_heads = list(all_rows[0].keys()) # explicitly order the columns, as dict order is unpredictable. the_heads = [ 'title', 'uri', 'accession_date', 'id_0', 'id_1', 'id_2', 'id_3', 'integer_1', 'resource_bibid', 'resource_asid', 'repo', 'year', 'fiscal-year', 'ext-number', 'ext-portion', 'ext-type', 'processing_priority', 'processing_status', 'create_time', 'system_mtime', 'last_modified_by' ] the_output = [] # Build row in order specified by the_heads
def main():
    report_level = "low"  # 'low' = only parse/schema errors; 'high' = include schematron warnings

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    print("Script " + my_name + " begun at " + start_time + ". ")
    print(" ")

    ################################
    #
    # Rsync files from web application to storage directory
    #
    ################################

    print("====== Syncing files from production cache... ======")
    print(" ")

    keyPath = "/home/ldpdapp/.ssh/id_dsa"
    fromPath = (
        "ldpdapp@ldpd-nginx-prod1:/opt/passenger/ldpd/findingaids_prod/caches/ead_cache"
    )
    toPath = "/cul/cul0/ldpd/archivesspace/"
    myOptions = "--exclude 'clio*'"

    x = util.rsync_process(keyPath, fromPath, toPath, myOptions)
    print(x)
    print(" ")

    ################################
    #
    # Perform validation reporting
    #
    ################################

    print("====== Validating files... ======")
    print(" ")

    if report_level == "high":
        print('* Logging level: "' + report_level +
              '" — showing all errors and warnings. *')
    else:
        print('* Logging level: "' + report_level +
              '" – showing only errors. Check report for complete results including warnings. *')
    print(" ")

    # The Google Sheet to send data to
    the_data_sheet = dataSheet("1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0",
                               "validation!A:Z")
    # the_data_sheet = dataSheet(
    #     '1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0', 'test!A:Z')  # Test

    # This is a dupe for other reporting
    the_data_sheet2 = dataSheet("198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY",
                                "validation!A:Z")

    # Set path to saxon processor for evaluator xslt
    saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')

    # Set path to schema validator (Jing)
    jing_path = os.path.join(my_path,
                             "../../resources/jing-20091111/bin/jing.jar")

    schema_filename = "schemas/cul_as_ead.rng"
    # schematron_filename = "schemas/cul_as_ead.sch"
    xslt_filename = "schemas/cul_as_ead.xsl"
    schema_path = os.path.join(my_path, schema_filename)
    xslt_path = os.path.join(my_path, xslt_filename)

    data_folder = "/cul/cul0/ldpd/archivesspace/ead_cache"
    # data_folder = '/cul/cul0/ldpd/archivesspace/test/ead'  # for testing

    # Use in notification email to distinguish errors/warnings
    icons = {
        "redx": "\U0000274C",  # use for parse errors
        "exclamation": "\U00002757",
        "warning": "\U000026A0\U0000FE0F",  # use for schema validation errors
        "qmark": "\U00002753",
    }

    # Load files from directory into a list
    the_file_paths = []
    for root, dirs, files in os.walk(os.path.abspath(data_folder)):
        for file in files:
            the_file_paths.append(os.path.join(root, file))

    # The column heads for the report spreadsheet
    the_heads = [
        "bibid",
        "file",
        "well-formed?",
        "valid?",
        "schema output",
        "schematron output",
        "warning type",
    ]

    the_results = []
    the_results.append(the_heads)

    # Counters
    parse_errors = 0
    validation_errors = 0
    sch_warnings = 0

    for a_file in the_file_paths:
        the_file_data = []
        file_name = a_file.split("/")[-1]
        bibid = file_name.split("_")[-1].split(".")[0]

        validation_result = util.jing_process(jing_path, a_file, schema_path)

        if "fatal:" in validation_result:
            # It's a parsing error.
            err_msg = icons["redx"] + " FATAL ERROR: " + \
                file_name + " could not be parsed!"
            print(err_msg)
            digester.post_digest(script_name, err_msg)
            wf_status = False
            validation_status = False
            parse_errors += 1
        else:
            wf_status = True
            if "error:" in validation_result:
                # It's a validation error.
                validation_status = False
                err_msg = icons["warning"] + " ERROR: " + \
                    file_name + " contains validation errors."
                print(err_msg)
                digester.post_digest(script_name, err_msg)
                validation_errors += 1
            else:
                validation_status = True

        if validation_result:
            validation_result_clean = clean_output(validation_result,
                                                   incl_types=False)[0]
        else:
            validation_result_clean = validation_result

        if wf_status == False:
            schematron_result_clean = "-"
            warning_types = []
        else:
            # schematron_result = util.jing_process(
            #     jing_path, a_file, schematron_path)
            schematron_result = util.saxon_process(saxon_path, a_file,
                                                   xslt_path, None)
            if schematron_result:
                # It's a schematron violation.
                if report_level == "high":
                    # Only show if required by reporting level var
                    # (use to filter out large numbers of warnings).
                    err_msg = "WARNING: " + file_name + " has Schematron rule violations."
                    print(err_msg)
                    digester.post_digest(script_name, err_msg)
                sch_warnings += 1

            if schematron_result:
                x = clean_output(schematron_result, incl_types=True)
                schematron_result_clean = x[0]
                warning_types = x[1]
            else:
                schematron_result_clean = ""
                warning_types = ""

        the_file_data = [
            bibid,
            file_name,
            wf_status,
            validation_status,
            validation_result_clean,
            schematron_result_clean,
            ", ".join(warning_types),
        ]

        the_results.append(the_file_data)

    the_data_sheet.clear()
    the_data_sheet.appendData(the_results)
    the_data_sheet2.clear()
    the_data_sheet2.appendData(the_results)

    # Generate log and add to log tab, if exists.
    the_tabs = the_data_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("EADs from " + data_folder + " evaluated by " + schema_filename +
               " and " + xslt_filename + ". Parse errors: " + str(parse_errors) +
               ". Schema errors: " + str(validation_errors) +
               ". Schematron warnings: " + str(sch_warnings) +
               ". Start: " + start_time + ". Finished: " + end_time +
               " (duration: " + my_duration + ").")

    if "log" in the_tabs:
        log_range = "log!A:A"
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(the_data_sheet.id, log_range).appendData([[the_log]])
    else:
        print("*** Warning: There is no log tab in this sheet. ***")

    print(" ")
    # print(the_log)

    print("Parse errors: " + str(parse_errors))
    digester.post_digest(script_name, "Parse errors: " + str(parse_errors))
    print("Schema errors: " + str(validation_errors))
    digester.post_digest(script_name,
                         "Schema errors: " + str(validation_errors))
    print("Schematron warnings: " + str(sch_warnings))
    digester.post_digest(script_name,
                         "Schematron warnings: " + str(sch_warnings))
    print(" ")

    exit_msg = "Script done. Check report sheet for more details: " + the_data_sheet.url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)

    quit()
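
# clean_output is a local helper not included in this excerpt. Based only on
# how it is called above (returning a cleaned message string, plus a list of
# warning types when incl_types=True), a rough sketch of its shape might look
# like this; the line handling and the bracketed-tag regex are assumptions,
# not the project's actual logic.
import re


def clean_output_sketch(raw, incl_types=False):
    """Hypothetical stand-in for clean_output: collapse validator output into
    one readable string, and optionally collect warning 'types' (assumed here
    to appear as bracketed tags like [ref-check])."""
    lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
    cleaned = "; ".join(lines)
    if not incl_types:
        return [cleaned]
    types = sorted(set(re.findall(r'\[([\w-]+)\]', raw)))
    return [cleaned, types]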