def test_saxon_process():
    # Test XSLT transform using Saxon
    in_file = os.path.join(MY_PATH, '../xslt/OAI_SAMPLE.asClean.xml')
    xsl_file = os.path.join(MY_PATH, '../xslt/extract-bibids.xsl')
    params = 'filename=' + in_file
    x = util.saxon_process(in_file, xsl_file, None, theParams=params)
    assert 'BibID: 4078817' in x
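
# Note: util.saxon_process is a project helper not shown in this excerpt.
# The sketch below is only an illustration of how such a wrapper might shell
# out to Saxon-HE, assuming a local Saxon jar and the space-separated
# key=value parameter convention seen in the calls above; it is not the
# project's actual implementation.
import subprocess


def saxon_process_sketch(in_file, xsl_file, out_file, theParams=""):
    saxon_jar = "../../resources/saxon-9.8.0.12-he.jar"  # assumed location
    cmd = ["java", "-jar", saxon_jar, "-s:" + in_file, "-xsl:" + xsl_file]
    if out_file:
        cmd.append("-o:" + out_file)
    if theParams:
        # e.g. "filename=/path/to/file.xml" or "key1=a key2=b"
        cmd += theParams.split(" ")
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    # With no -o: option, Saxon writes the transform result to stdout.
    return result.stdout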
def check_clio(date, filepath):
    # Get a list of BIBIDs from stylesheet
    x = util.saxon_process(filepath, XSLT_PATH, None)
    the_deltas = x.split(',')
    print(the_deltas)
    if len(the_deltas) < 1:
        print("No bibids found in " + str(filepath) + ". Bypassing CLIO check.")
        quit()

    # Check to see if the datestamp in the 005 field matches the date from
    # the delta update. Allow a couple of retries, as some MARC records are
    # very large and may not be loadable by http.
    retry_max = 2
    retries = 0

    # Choose one random one to look up
    bibid = random.choice(the_deltas)
    the_bibids_tried = []

    while retries < retry_max:
        while bibid in the_bibids_tried:
            bibid = random.choice(the_deltas)
        the_bibids_tried.append(bibid)
        # print(bibid)
        # print(retries)
        try:
            datestamp = read_005(bibid)
            if datestamp == date:
                return True
            print("WARNING: 005 data for " + str(bibid) + " (" + datestamp +
                  ") does not match " + str(date))
            return False
            # retries = retry_max
        except Exception as e:
            if "request error" in str(e):
                # Retry with another record; large records may fail to load
                # over http.
                retries += 1
            else:
                raise Exception(
                    "CLIO error: Could not verify that datestamps have been updated! "
                    + str(e))
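
# Note: read_005 is defined elsewhere in the project. The sketch below is a
# rough, hypothetical illustration only: the MARCXML URL is a placeholder,
# not CLIO's actual endpoint, and the assumption that check_clio compares on
# a yyyymmdd prefix of the 005 field is unverified.
import requests
import xml.etree.ElementTree as ET


def read_005_sketch(bibid):
    marcxml_url = "https://clio.example.edu/catalog/{bibid}.marcxml"  # hypothetical
    resp = requests.get(marcxml_url.format(bibid=bibid), timeout=30)
    if resp.status_code != 200:
        raise Exception("request error: HTTP " + str(resp.status_code))
    root = ET.fromstring(resp.content)
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}
    f005 = root.find(".//marc:controlfield[@tag='005']", ns)
    if f005 is None:
        raise Exception("No 005 field found for " + str(bibid))
    # 005 is yyyymmddhhmmss.f; return the date portion for comparison.
    return f005.text[:8]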
def main():
    my_name = __file__
    # This makes sure the script can be run from any working directory
    # and still find related files.
    my_path = os.path.dirname(__file__)

    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    storage_dir = "/cul/cul0/ldpd/archivesspace/"
    # saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')

    # XSLT for transformation. Accepts a path in param in which to save the
    # html snippet files.
    xsl_filename = '../xslt/generate_browse_list.xsl'
    xsl_path = os.path.join(my_path, xsl_filename)

    # Use the OAI file from previous day as source to generate lists.
    input_filename = yest_str + ".asAllRaw.xml"
    input_path = storage_dir + "oai/" + input_filename
    print("Input file: " + input_path)

    # The location for the stylesheet to save output documents.
    output_path = storage_dir + "fa_lists"
    # output_path = storage_dir + "test"  # test
    print("Output location: " + output_path)
    # output_path = os.path.join(my_path, 'output/fa_lists')  # test

    params = "output_dir=" + output_path

    # x = util.saxon_process(input_path, xsl_path, None, params)
    x = util.saxon_process(input_path, xsl_path, None, theParams=params)
    print(x)
def main():
    MY_NAME = __file__
    global SCRIPT_NAME
    SCRIPT_NAME = os.path.basename(MY_NAME)

    # This makes sure the script can be run from any working directory
    # and still find related files.
    MY_PATH = os.path.dirname(__file__)

    sheet_id = "1Ltf5_hhR-xN4YSvNWmPX8bqJA1UjqAaSjgeHBr_5chA"
    parse_sheet = dataSheet(sheet_id, "parse!A:Z")  # Test
    validation_sheet = dataSheet(sheet_id, "schema!A:Z")  # Test
    eval_sheet = dataSheet(sheet_id, "eval!A:Z")  # Test

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    ################################
    #
    # Rsync files from web application to storage directory
    #
    ################################

    print("====== Syncing files from production cache... ======")
    print(" ")

    fromPath = "ldpdserv@ldpd-nginx-prod1:/opt/passenger/ldpd/findingaids_prod/caches/ead_cache"
    toPath = "/cul/cul0/ldpd/archivesspace/"
    myOptions = "--exclude 'clio*' --exclude '*.txt'"

    x = util.rsync_process(fromPath, toPath, myOptions)
    print(x)
    print(" ")

    ################################
    #
    # Perform validation reporting
    #
    ################################

    schema_path = os.path.join(MY_PATH, "../schemas/cul_as_ead.rng")
    csv_out_path = os.path.join(MY_PATH, "temp_out.txt")
    xslt_path = os.path.join(MY_PATH, "../schemas/cul_as_ead.xsl")
    data_folder = "/cul/cul0/ldpd/archivesspace/ead_cache"
    # data_folder = "/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/ead_rsync_test"  # test
    # data_folder = '/cul/cul0/ldpd/archivesspace/test/ead'  # for testing

    # Use in notification email to distinguish errors/warnings
    icons = {
        "redx": "\U0000274C",
        "exclamation": "\U00002757",
        "warning": "\U000026A0\U0000FE0F",
        "qmark": "\U00002753",
    }

    # Check for malformed xml. If there is any, then don't do further
    # validation, because it will fail once it hits an unparseable file.
    print(" ")
    print("====== Checking well-formedness ... ======")

    parse_errs = []
    try:
        x = util.run_bash("xmllint " + data_folder + "/* --noout",
                          errorPrefix="PARSE")
        # print(x)
        log_it("All files well-formed.")
    except Exception as e:
        if "PARSEERROR" in str(e):
            parse_errs = [
                msg_parse(l, icons["redx"]) for l in str(e).splitlines()
                if "as_ead" in l
            ]
            parse_errs = clean_array(parse_errs)
            if parse_errs:
                for e in get_unique_bibid_all_errors(parse_errs):
                    log_it(icons["redx"] + "PARSE ERROR: " + e)

    parse_err_cnt = get_unique_count(parse_errs)

    if parse_errs:
        log_it("There were " + str(parse_err_cnt) +
               " unparseable records! Validation of files could not be "
               "completed. Fix syntax and run script again.")
        parse_sheet.clear()
        parse_sheet.appendData(parse_errs)
        quit()

    # No parsing errors, so proceed...
    parse_sheet.clear()

    print(" ")
    print("====== Validating files... ======")

    # Batch validate against RNG schema.
    x = util.jing_process_batch(data_folder, schema_path, "as_ead*")

    schema_errs = [
        msg_parse(l, icons["exclamation"]) for l in str(x).splitlines()
        if "as_ead" in l
    ]
    schema_err_cnt = get_unique_count(schema_errs)

    if schema_errs:
        for e in get_unique_bibid_all_errors(schema_errs):
            log_it(icons["exclamation"] + "VALIDATION ERROR: " + e)
    else:
        log_it("All files are valid.")

    validation_sheet.clear()
    validation_sheet.appendData(schema_errs)

    print(" ")
    print("====== Evaluating with XSLT ... ======")

    try:
        x = util.saxon_process(xslt_path, xslt_path, csv_out_path,
                               theParams="filePath=" + data_folder)
        eval_sheet.clear()
        eval_sheet.importCSV(csv_out_path, delim="|")
    except Exception as e:
        if "SAXON ERROR" in str(e):
            print("Cancelled!")

    evals = eval_sheet.getDataColumns()[0]
    eval_bibs = set(evals)
    warnings_cnt = len(eval_bibs)

    if evals:
        log_it(icons["warning"] + " " + str(len(evals)) + " warnings in " +
               str(warnings_cnt) + " files.")
    else:
        log_it("There were no problems found!")

    the_tabs = validation_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)

    if "log" in the_tabs:
        log_range = "log!A:A"
        my_duration = str(now2 - now1)
        the_log = ("EADs from " + data_folder + " evaluated by " + schema_path +
                   " and " + xslt_path + ". Parse errors: " +
                   str(parse_err_cnt) + ". Schema errors: " +
                   str(schema_err_cnt) + ". XSLT warnings: " +
                   str(warnings_cnt) + ". Start: " + start_time +
                   ". Finished: " + end_time + " (duration: " + my_duration +
                   ").")
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(validation_sheet.id, log_range).appendData([[the_log]])
    else:
        print("*** Warning: There is no log tab in this sheet. ***")

    print(" ")
    # print(the_log)

    log_it("Files with parse errors: " + str(parse_err_cnt))
    log_it("Files with schema errors: " + str(schema_err_cnt))
    log_it("Files with warnings: " + str(warnings_cnt))

    print(" ")

    exit_msg = ("Script done. Check report sheet for more details: " +
                validation_sheet.url)
    log_it(exit_msg)

    quit()
x = the_sheet.lookup('4079432', 0, 1)
print(x)

print(' ')
print('testing archivesspace api...')

x = asf.getResource(2, 5907)
print(x)

print(' ')
print("testing saxon ...")

saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')
source_dir = '/cul/cul0/ldpd/archivesspace/oai'
in_file = os.path.join(source_dir, '20201111.asClean.xml')
xsl_file = os.path.join(my_path, '../xslt/extract-bibids.xsl')
params = 'filename=' + in_file

x = util.saxon_process(in_file, xsl_file, None, theParams=params)
print(x)

print("This is a test!")
print("Yes it worked...")
def main():
    # x = get_oapen_item(627426)
    # pprint(x)

    xml_dir = '/Users/dwh2128/Documents/SimplyE/books/Gutenberg/epub/'
    xslt_path = '/Users/dwh2128/Documents/SimplyE/books/Gutenberg/gutenberg_to_opds.xsl'
    output_folder = 'output/oa_clio/aaw/'

    sheet_id = '1aS2zZzDOAzr-LwNGjhxEofIfLBWIO0XM2Ft43Ec1amo'
    sheet_tab = 'AAW'
    # sheet_tab = 'Sheet1'
    # sheet_tab = 'Test'
    feed_stem = 'gutenberg_feed'
    collection_title = "Project Gutenberg EBooks | Columbia University Libraries"

    print('Extracting ' + sheet_tab + ' ... ')

    the_info = get_collection(sheet_id, sheet_tab, feed_stem, collection_title,
                              multipart=False)

    # Divide list into chunks
    # chunk_size = 5
    chunk_size = 500
    total_count = len(the_info)
    print('Total count: ' + str(total_count))
    running_count = 0

    the_chunks = divide_list(the_info, chunk_size)

    for idx, record_chunk in enumerate(the_chunks):
        running_count += len(record_chunk)
        print('Running_count = ' + str(running_count))
        print('')
        page_no = idx + 1
        if page_no > 1:
            feed_name = feed_stem + '_p' + str(page_no) + '.xml'
            feed_list_name = feed_stem + '_list_p' + str(page_no) + '.xml'
        else:
            feed_name = feed_stem + '.xml'
            feed_list_name = feed_stem + '_list' + '.xml'

        # Add feed_next, only if it is not the last one
        if running_count < total_count:
            feed_next_name = feed_stem + '_p' + str(page_no + 1) + '.xml'
            feed_next_path = ('https://ebooks.library.columbia.edu/static-feeds/oa_clio/'
                              + feed_next_name)
        else:
            feed_next_name = ''
            feed_next_path = ''

        root = etree.Element("records")

        for r in record_chunk:
            rdf_path = xml_dir + str(r['id']) + '/pg' + str(r['id']) + '.rdf'
            # Look to verify that there is an RDF file to get data from.
            if os.path.exists(rdf_path):
                rec = etree.SubElement(root, "record")
                bibid = etree.SubElement(rec, "bibid")
                bibid.text = r['bibid']
                bookid = etree.SubElement(rec, "bookid")
                bookid.text = r['id']
            else:
                print("Warning: could not find RDF file for " + str(r['id']))

        # print(etree.tostring(root, pretty_print=True))

        list_file_path = 'output/' + feed_list_name
        with open(list_file_path, 'wb') as f:
            f.write(etree.tostring(root, pretty_print=True))

        # feed_file_name = feed_stem + '.xml'
        util.saxon_process(
            list_file_path, xslt_path, output_folder + feed_name,
            theParams='feedURL=https://ebooks.library.columbia.edu/static-feeds/oa_clio/'
            + feed_name + ' feedNext=' + feed_next_path)

    val = validate_files(output_folder)
    the_errors = [f for f in val if f['errors']]
    if the_errors:
        print(the_errors)
    else:
        print("All files are valid!")

    quit()
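
# Note: divide_list is a project helper not shown in this excerpt. From its
# use above it appears to split a list into consecutive chunks of at most
# chunk_size items; a minimal equivalent might look like this (sketch only).
def divide_list_sketch(the_list, chunk_size):
    return [the_list[i:i + chunk_size]
            for i in range(0, len(the_list), chunk_size)]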
def main():
    asf.setServer('Prod')  # AS instance: Prod | Dev | Test

    mode = 'Prod'  # Prod or Test

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory
    # and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    ########################
    ### PROCESS OAI DATA ###
    ########################

    # Set path to Saxon processor
    # saxon_path = os.path.join(my_path, "../../resources/saxon-9.8.0.12-he.jar")

    # XSLT file to generate report
    marc_xslt_file = os.path.join(my_path, '../xslt/marcDataExtract.xsl')

    if mode == 'Prod':
        # OAI XML file to use as source
        # source_dir = '/cul/cul0/lito/libsys/voyager/prod/data/loads/AS_harvest'
        source_dir = '/cul/cul0/ldpd/archivesspace/oai'
        sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
        oai_file = source_dir + '/' + yest_str + '.asAllRaw.xml'
    else:
        # TEST
        yest_str = "20190915"
        # OAI XML file to use as source
        source_dir = '/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/cached_eads_20190912'  # local test
        sheet_id = '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ'
        oai_file = yest_str + '.asAllRaw.xml'

    the_sheets = {
        'oai': dataSheet(sheet_id, 'oai!A:Z'),
        'oai_last': dataSheet(sheet_id, 'oai_last!A:Z'),
        'log': dataSheet(sheet_id, 'log!A:Z')
    }

    the_outpath = os.path.join(my_path,
                               'output/' + yest_str + '.marc_reporter_out.xml')

    print(' ')

    # Copy oai current data to oai_last sheet for diff
    the_old_data = the_sheets['oai'].getData()
    the_sheets['oai_last'].clear()
    the_sheets['oai_last'].appendData(the_old_data)

    # Process OAI MARC and output to CSV
    util.saxon_process(oai_file, marc_xslt_file, the_outpath)

    # clear data from "new" sheet
    the_sheets['oai'].clear()

    # Send result csv to Google Sheet.
    y = the_sheets['oai'].importCSV(the_outpath, delim='|')

    print(' ')

    ###########################
    ### PROCESS UNPUBLISHED ###
    ###########################

    print('Finding unpublished records...')

    the_repos = [2, 3, 4, 5]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_heads = [
        'REPO', 'REPO_ID', 'RESOURCE_ID', 'TITLE', 'BIBID', 'CREATE_TIME',
        'SYSTEM_MTIME', 'LAST_MODIFIED_BY'
    ]

    unpubs_sheet = dataSheet(sheet_id, 'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # print(row)
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid)
            row.insert(0, repo_id)
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)

        print('Repo ' + str(r) + ': ' + str(len(x)))

    # print('Total unpublished: ' + str(len(the_unpublished)))
    msg = 'Total unpublished: ' + str(len(the_unpublished))
    print(msg)
    digester.post_digest(script_name, msg)  # Test

    unpubs_sheet.clear()
    unpubs_sheet.appendData([the_heads])
    unpubs_sheet.appendData(the_unpublished)

    #########################
    ### GET NEWLY CREATED ###
    #########################

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:
        print('processing ' + d['filter'])

        the_delta_sheet = dataSheet(sheet_id, d['range'])
        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]
        the_heads = [
            'repo', 'asid', 'title', 'identifier', 'create_time',
            'system_mtime', 'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:
            print('searching repo ' + str(r))

            x = asf.getByDate(r, the_date, date_type='ctime',
                              comparator='equal', filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                # print(row)
                # get the repo from the uri string.
                repo = str(row[0].split('/')[-3]).rstrip()
                # get the asid from the uri string.
                asid = str(row[0].split('/')[-1]).rstrip()
                row.pop(0)
                row.insert(0, asid)
                row.insert(0, repo)
                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))
        digester.post_digest(script_name,
                             'New ' + d['filter'] + ': ' + str(len(the_modifieds)))
        # Test
        # the_sheet.clear()
        # the_sheet.appendData([the_fields])
        the_delta_sheet.appendData(the_modifieds)

    #################
    ### FINISH UP ###
    #################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = 'Data imported by ' + my_name + '. Start: ' + start_time + \
        '. Finished: ' + end_time + ' (duration: ' + my_duration + ').'

    the_sheets['log'].appendData([[the_log]])

    print(' ')
    print(the_log)
    digester.post_digest(script_name, the_log)  # Test

    print(' ')
    print('Script done. Updated data is available at ' + the_sheets['oai'].url)
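
# Note: both loops above derive the repository id and record id positionally
# from an ArchivesSpace URI such as '/repositories/2/resources/5907'. The
# helper below is illustrative only (not part of the original scripts) and
# just makes that convention explicit.
def split_as_uri(uri):
    parts = uri.rstrip().split('/')
    repo_id = int(parts[-3])
    asid = int(parts[-1])
    return repo_id, asid

# e.g. split_as_uri('/repositories/2/resources/5907') -> (2, 5907)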
def main():
    MY_NAME = __file__
    global SCRIPT_NAME
    SCRIPT_NAME = os.path.basename(MY_NAME)

    # This makes sure the script can be run from any working directory
    # and still find related files.
    MY_PATH = os.path.dirname(__file__)

    sheet_id = '1Ltf5_hhR-xN4YSvNWmPX8bqJA1UjqAaSjgeHBr_5chA'
    parse_sheet = dataSheet(sheet_id, 'parse!A:Z')  # Test
    validation_sheet = dataSheet(sheet_id, 'schema!A:Z')  # Test
    eval_sheet = dataSheet(sheet_id, 'eval!A:Z')  # Test

    # This is a dupe for other reporting
    # the_data_sheet2 = dataSheet(
    #     "198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY", "validation!A:Z")

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    ################################
    #
    # Rsync files from web application to storage directory
    #
    ################################

    print("====== Syncing files from production cache... ======")
    print(" ")

    # keyPath = "/home/ldpdserv/.ssh/id_dsa"
    fromPath = (
        "ldpdserv@ldpd-nginx-prod1:/opt/passenger/ldpd/findingaids_prod/caches/ead_cache"
    )
    toPath = "/cul/cul0/ldpd/archivesspace/"
    myOptions = "--exclude 'clio*'"

    x = util.rsync_process(fromPath, toPath, myOptions)
    print(x)
    print(" ")

    ################################
    #
    # Perform validation reporting
    #
    ################################

    schema_path = os.path.join(MY_PATH, "../schemas/cul_as_ead.rng")
    csv_out_path = os.path.join(MY_PATH, "temp_out.txt")
    xslt_path = os.path.join(MY_PATH, "../schemas/cul_as_ead2.xsl")  # test
    data_folder = "/cul/cul0/ldpd/archivesspace/ead_cache"
    # data_folder = "/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/ead_rsync_test"  # test
    # data_folder = '/cul/cul0/ldpd/archivesspace/test/ead'  # for testing

    # Use in notification email to distinguish errors/warnings
    icons = {
        "redx": "\U0000274C",
        "exclamation": "\U00002757",
        "warning": "\U000026A0\U0000FE0F",
        "qmark": "\U00002753",
    }

    # Check for malformed xml. If there is any, then don't do further
    # validation, because it will fail once it hits an unparseable file.
    print(" ")
    print("====== Checking well-formedness ... ======")

    parse_errs = []
    try:
        x = util.run_bash('xmllint ' + data_folder + '/* --noout',
                          errorPrefix='PARSE')
        # print(x)
        log_it("All files well-formed.")
    except Exception as e:
        if 'PARSEERROR' in str(e):
            parse_errs = [
                msg_parse(l, icons['redx']) for l in str(e).splitlines()
                if 'as_ead' in l
            ]
            # print(parse_errs)
            for e in get_unique_bibids(parse_errs):
                log_it(icons['redx'] + " " + str(e) + " has parsing errors.")

    parse_err_cnt = get_unique_count(parse_errs)

    if parse_errs:
        log_it('There were ' + str(parse_err_cnt) +
               ' unparseable records! Validation of files could not be '
               'completed. Fix syntax and run script again.')
        parse_sheet.clear()
        parse_sheet.appendData(parse_errs)
        quit()

    # No parsing errors, so proceed...
    parse_sheet.clear()

    print(" ")
    print("====== Validating files... ======")

    # Validate against schema. Xargs batches files so they won't exceed
    # limit on arguments with thousands of files.
    x = util.run_bash('find ' + data_folder + ' -name "as_ead*" | xargs -L 128 java -jar '
                      + util.config['FILES']['jingPath'] + ' -d ' + schema_path,
                      errorPrefix='JING')

    schema_errs = [
        msg_parse(l, icons['exclamation']) for l in str(x).splitlines()
        if 'as_ead' in l
    ]
    schema_err_cnt = get_unique_count(schema_errs)

    if schema_errs:
        for e in get_unique_bibids(schema_errs):
            log_it(icons['exclamation'] + " " + str(e) + " has validation errors.")
    else:
        log_it("All files are valid.")

    validation_sheet.clear()
    validation_sheet.appendData(schema_errs)

    print(" ")
    print("====== Evaluating with XSLT ... ======")

    try:
        x = util.saxon_process(xslt_path, xslt_path, csv_out_path,
                               theParams='filePath=' + data_folder)
        eval_sheet.clear()
        eval_sheet.importCSV(csv_out_path, delim='|')
    except Exception as e:
        if "SAXON ERROR" in str(e):
            print("Cancelled!")

    evals = eval_sheet.getDataColumns()[0]
    eval_bibs = set(evals)
    warnings_cnt = len(eval_bibs)

    if evals:
        log_it(icons['warning'] + " " + str(len(evals)) + " warnings in " +
               str(warnings_cnt) + " files.")
    else:
        log_it("There were no problems found!")

    the_tabs = validation_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)

    if "log" in the_tabs:
        log_range = "log!A:A"
        my_duration = str(now2 - now1)
        the_log = ("EADs from " + data_folder + " evaluated by " + schema_path +
                   " and " + xslt_path + ". Parse errors: " +
                   str(parse_err_cnt) + ". Schema errors: " +
                   str(schema_err_cnt) + ". XSLT warnings: " +
                   str(warnings_cnt) + ". Start: " + start_time +
                   ". Finished: " + end_time + " (duration: " + my_duration +
                   ").")
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(validation_sheet.id, log_range).appendData([[the_log]])
    else:
        print("*** Warning: There is no log tab in this sheet. ***")

    print(" ")
    # print(the_log)

    log_it("Files with parse errors: " + str(parse_err_cnt))
    log_it("Files with schema errors: " + str(schema_err_cnt))
    log_it("Files with warnings: " + str(warnings_cnt))

    print(" ")

    exit_msg = "Script done. Check report sheet for more details: " + validation_sheet.url
    log_it(exit_msg)

    quit()
def main():
    # Set to True to harvest complete set; otherwise will select based on date.
    HARVESTALL = False

    my_name = __file__
    my_path = os.path.dirname(__file__)
    script_name = os.path.basename(my_name)

    # Calculate dates in format yyyymmdd
    today = datetime.date.today().strftime("%Y%m%d")
    yesterday = (datetime.date.today() -
                 datetime.timedelta(days=1)).strftime("%Y%m%d")

    destination_folder = "/cul/cul0/ldpd/archivesspace/oai"
    # destination_folder = "/cul/cul0/ldpd/archivesspace/test"  # test
    # destination_folder = "./"  # test

    xslt_path = os.path.join(my_path, "../xslt/cleanOAI.xsl")

    out_path_raw = os.path.join(destination_folder, today + ".asRaw.xml")
    out_path_raw_all = os.path.join(destination_folder, today + ".asAllRaw.xml")
    out_path_clean = os.path.join(destination_folder, today + ".asClean.xml")

    # Set server to Prod | Test | Dev
    server = "Prod"

    fromDate = yesterday

    # # Not using date, get all records and then filter with the XSLT!
    # date_params = ""

    # Select date interval for harvest
    # TODO: change this to be controlled by param file.
    if HARVESTALL:
        date_params = " "  # Use this to harvest all records.
    else:
        date_params = "-f " + yesterday

    # Harvest OAI-PMH data
    print("Harvesting data from OAI...")
    util.oai_harvest(out_path_raw, server=server, date_params=date_params)

    # Process through XSLT
    # TODO: change xsl to not require this param, if we are doing it in the harvest!
    time_offset = 'P800DT30H'
    saxon_params = " time_offset=" + time_offset

    print("Processing file with XSLT...")
    x = util.saxon_process(out_path_raw, xslt_path, out_path_clean,
                           theParams=saxon_params)
    print(x)
    digester.post_digest(script_name, x)

    print("Harvesting all records for reporting ...")
    date_params = " "
    util.oai_harvest(out_path_raw_all, server=server, date_params=date_params)

    # Remove old OAI files
    util.file_cleanup(destination_folder, 30)

    digester.post_digest(
        script_name, script_name + ' completed at ' +
        str(datetime.datetime.now().strftime('%m/%d/%Y %H:%M:%S')) + '.')
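
# Note: util.file_cleanup is a project helper not shown in this excerpt.
# Judging from the call above, it prunes files in a folder older than a
# given number of days; the sketch below is a minimal illustration under
# that assumption, not the project's actual implementation.
import os
import time


def file_cleanup_sketch(folder, days_to_keep):
    cutoff = time.time() - days_to_keep * 86400
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        # Delete regular files whose modification time is past the cutoff.
        if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
            os.remove(path)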
def main():
    report_level = "low"  # 'low' = only parse/schema errors; 'high' = include schematron warnings

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory
    # and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    print("Script " + my_name + " begun at " + start_time + ". ")
    print(" ")

    ################################
    #
    # Rsync files from web application to storage directory
    #
    ################################

    print("====== Syncing files from production cache... ======")
    print(" ")

    keyPath = "/home/ldpdapp/.ssh/id_dsa"
    fromPath = (
        "ldpdapp@ldpd-nginx-prod1:/opt/passenger/ldpd/findingaids_prod/caches/ead_cache"
    )
    toPath = "/cul/cul0/ldpd/archivesspace/"
    myOptions = "--exclude 'clio*'"

    x = util.rsync_process(keyPath, fromPath, toPath, myOptions)
    print(x)
    print(" ")

    ################################
    #
    # Perform validation reporting
    #
    ################################

    print("====== Validating files... ======")
    print(" ")

    if report_level == "high":
        print('* Logging level: "' + report_level +
              '" — showing all errors and warnings. *')
    else:
        print('* Logging level: "' + report_level +
              '" – showing only errors. Check report for complete results '
              'including warnings. *')
    print(" ")

    # The Google Sheet to send data to
    the_data_sheet = dataSheet("1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0",
                               "validation!A:Z")
    # the_data_sheet = dataSheet(
    #     '1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0', 'test!A:Z')  # Test

    # This is a dupe for other reporting
    the_data_sheet2 = dataSheet("198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY",
                                "validation!A:Z")

    # Set path to saxon processor for evaluator xslt
    saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')

    # Set path to schema validator (Jing)
    jing_path = os.path.join(my_path,
                             "../../resources/jing-20091111/bin/jing.jar")

    schema_filename = "schemas/cul_as_ead.rng"
    # schematron_filename = "schemas/cul_as_ead.sch"
    xslt_filename = "schemas/cul_as_ead.xsl"
    schema_path = os.path.join(my_path, schema_filename)
    xslt_path = os.path.join(my_path, xslt_filename)

    data_folder = "/cul/cul0/ldpd/archivesspace/ead_cache"
    # data_folder = '/cul/cul0/ldpd/archivesspace/test/ead'  # for testing

    # Use in notification email to distinguish errors/warnings
    icons = {
        "redx": "\U0000274C",  # use for parse errors
        "exclamation": "\U00002757",
        "warning": "\U000026A0\U0000FE0F",  # use for schema validation errors
        "qmark": "\U00002753",
    }

    # Load files from directory into a list
    the_file_paths = []
    for root, dirs, files in os.walk(os.path.abspath(data_folder)):
        for file in files:
            the_file_paths.append(os.path.join(root, file))

    # The column heads for the report spreadsheet
    the_heads = [
        "bibid",
        "file",
        "well-formed?",
        "valid?",
        "schema output",
        "schematron output",
        "warning type",
    ]

    the_results = []
    the_results.append(the_heads)

    # counters
    parse_errors = 0
    validation_errors = 0
    sch_warnings = 0

    for a_file in the_file_paths:
        the_file_data = []
        file_name = a_file.split("/")[-1]
        bibid = file_name.split("_")[-1].split(".")[0]

        validation_result = util.jing_process(jing_path, a_file, schema_path)

        if "fatal:" in validation_result:
            # It's a parsing error.
            err_msg = icons["redx"] + " FATAL ERROR: " + \
                file_name + " could not be parsed!"
            print(err_msg)
            digester.post_digest(script_name, err_msg)
            wf_status = False
            validation_status = False
            parse_errors += 1
        else:
            wf_status = True
            if "error:" in validation_result:
                # It's a validation error.
                validation_status = False
                err_msg = icons["warning"] + " ERROR: " + \
                    file_name + " contains validation errors."
                print(err_msg)
                digester.post_digest(script_name, err_msg)
                validation_errors += 1
            else:
                validation_status = True

        if validation_result:
            validation_result_clean = clean_output(validation_result,
                                                   incl_types=False)[0]
        else:
            validation_result_clean = validation_result

        if wf_status == False:
            schematron_result_clean = "-"
            warning_types = []
        else:
            # schematron_result = util.jing_process(
            #     jing_path, a_file, schematron_path)
            schematron_result = util.saxon_process(saxon_path, a_file,
                                                   xslt_path, None)
            if schematron_result:
                # It's a schematron violation.
                if report_level == "high":
                    # Only show if required by reporting level var
                    # (use to filter out large numbers of warnings).
                    err_msg = ("WARNING: " + file_name +
                               " has Schematron rule violations.")
                    print(err_msg)
                    digester.post_digest(script_name, err_msg)
                sch_warnings += 1

            if schematron_result:
                x = clean_output(schematron_result, incl_types=True)
                schematron_result_clean = x[0]
                warning_types = x[1]
            else:
                schematron_result_clean = ""
                warning_types = ""

        the_file_data = [
            bibid,
            file_name,
            wf_status,
            validation_status,
            validation_result_clean,
            schematron_result_clean,
            ", ".join(warning_types),
        ]

        the_results.append(the_file_data)

    the_data_sheet.clear()
    the_data_sheet.appendData(the_results)
    the_data_sheet2.clear()
    the_data_sheet2.appendData(the_results)

    # Generate log and add to log tab, if it exists.
    the_tabs = the_data_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("EADs from " + data_folder + " evaluated by " +
               schema_filename + " and " + xslt_filename + ". Parse errors: " +
               str(parse_errors) + ". Schema errors: " +
               str(validation_errors) + ". Schematron warnings: " +
               str(sch_warnings) + ". Start: " + start_time + ". Finished: " +
               end_time + " (duration: " + my_duration + ").")

    if "log" in the_tabs:
        log_range = "log!A:A"
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(the_data_sheet.id, log_range).appendData([[the_log]])
    else:
        print("*** Warning: There is no log tab in this sheet. ***")

    print(" ")
    # print(the_log)

    print("Parse errors: " + str(parse_errors))
    digester.post_digest(script_name, "Parse errors: " + str(parse_errors))
    print("Schema errors: " + str(validation_errors))
    digester.post_digest(script_name, "Schema errors: " + str(validation_errors))
    print("Schematron warnings: " + str(sch_warnings))
    digester.post_digest(script_name, "Schematron warnings: " + str(sch_warnings))
    print(" ")

    exit_msg = "Script done. Check report sheet for more details: " + the_data_sheet.url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)

    quit()