Example #1
def test_saxon_process():
    # Test XSLT transform using Saxon
    in_file = os.path.join(MY_PATH, '../xslt/OAI_SAMPLE.asClean.xml')
    xsl_file = os.path.join(MY_PATH, '../xslt/extract-bibids.xsl')
    params = 'filename=' + in_file
    x = util.saxon_process(in_file, xsl_file, None, theParams=params)
    assert 'BibID: 4078817' in x
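Every example on this page calls a project-local helper, util.saxon_process(in_file, xsl_file, out_file, theParams=...), whose implementation is not shown. As rough orientation only, a wrapper with that call pattern could shell out to the Saxon HE command line along the lines sketched below; the jar location, error handling, and return value are assumptions, not the project's actual code.

import subprocess

SAXON_JAR = "../resources/saxon-9.8.0.12-he.jar"  # assumed; Example #5 points at a jar like this

def saxon_process_sketch(in_file, xsl_file, out_file=None, theParams=""):
    # Illustrative sketch only, not the real util.saxon_process.
    # Saxon HE CLI: -s: source, -xsl: stylesheet, -o: output, plus name=value params.
    cmd = ["java", "-jar", SAXON_JAR, "-s:" + in_file, "-xsl:" + xsl_file]
    if out_file:
        cmd.append("-o:" + out_file)
    if theParams:
        cmd.extend(theParams.split())
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception("SAXON ERROR: " + result.stderr)
    return result.stdout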
Example #2
def check_clio(date, filepath):

    # Get a list of BIBIDs from stylesheet
    x = util.saxon_process(filepath, XSLT_PATH, None)
    the_deltas = x.split(',')
    print(the_deltas)

    # split() always returns at least one element, so also catch an empty result.
    if not the_deltas or the_deltas == ['']:
        print("No bibids found in " + str(filepath) +
              ". Bypassing CLIO check.")
        quit()

    # Check to see if the datestamp in the 005 field matches the date from the delta update.

    # Allow a couple of retries, as some MARC records are very large and
    # may not be loadable by http.
    retry_max = 2
    retries = 0
    # Choose one random one to look up
    bibid = random.choice(the_deltas)
    the_bibids_tried = []

    while retries < retry_max:

        while bibid in the_bibids_tried:
            bibid = random.choice(the_deltas)
        the_bibids_tried.append(bibid)
        # print(bibid)
        # print(retries)
        try:
            datestamp = read_005(bibid)
            if datestamp == date:
                return True
            print("WARNING: 005 data for " + str(bibid) + " (" + datestamp +
                  ") does not match " + str(date))
            return False
        except Exception as e:
            if "request error" in str(e):
                # Retry with a different BIBID before giving up; some MARC
                # records are very large and may fail to load over http.
                retries += 1
                if retries >= retry_max:
                    raise Exception(
                        "CLIO error: Could not verify that datestamps have been updated! "
                        + str(e))
            else:
                raise
Example #3
def main():
    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    storage_dir = "/cul/cul0/ldpd/archivesspace/"

    # saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')

    # XSLT for transformation. Accepts a path in param in which to save the html snippet files.
    xsl_filename = '../xslt/generate_browse_list.xsl'

    xsl_path = os.path.join(my_path, xsl_filename)

    # Use the OAI file from previous day as source to generate lists.
    input_filename = yest_str + ".asAllRaw.xml"

    input_path = storage_dir + "oai/" + input_filename

    print("Input file: " + input_path)

    # The location for the stylesheet to save output documents.
    output_path = storage_dir + "fa_lists"
    # output_path = storage_dir + "test"  # test

    print("Output location: " + output_path)

    # output_path = os.path.join(my_path, 'output/fa_lists')  # test

    params = "output_dir=" + output_path

    # x = util.saxon_process(input_path, xsl_path, None, params)
    x = util.saxon_process(input_path, xsl_path, None, theParams=params)
    print(x)
Example #4
def main():

    MY_NAME = __file__
    global SCRIPT_NAME
    SCRIPT_NAME = os.path.basename(MY_NAME)

    # This makes sure the script can be run from any working directory and still find related files.
    MY_PATH = os.path.dirname(__file__)

    sheet_id = "1Ltf5_hhR-xN4YSvNWmPX8bqJA1UjqAaSjgeHBr_5chA"

    parse_sheet = dataSheet(sheet_id, "parse!A:Z")  # Test
    validation_sheet = dataSheet(sheet_id, "schema!A:Z")  # Test
    eval_sheet = dataSheet(sheet_id, "eval!A:Z")  # Test

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    ################################
    #
    # Rsync files from web application to storage directory
    #
    ################################

    print("====== Syncing files from production cache... ======")
    print(" ")

    fromPath = "ldpdserv@ldpd-nginx-prod1:/opt/passenger/ldpd/findingaids_prod/caches/ead_cache"
    toPath = "/cul/cul0/ldpd/archivesspace/"

    myOptions = "--exclude 'clio*' --exclude '*.txt'"

    x = util.rsync_process(fromPath, toPath, myOptions)
    print(x)

    print(" ")

    ################################
    #
    # Perform validation reporting
    #
    ################################

    schema_path = os.path.join(MY_PATH, "../schemas/cul_as_ead.rng")

    csv_out_path = os.path.join(MY_PATH, "temp_out.txt")

    xslt_path = os.path.join(MY_PATH, "../schemas/cul_as_ead.xsl")

    data_folder = "/cul/cul0/ldpd/archivesspace/ead_cache"
    # data_folder = "/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/ead_rsync_test"  # test
    # data_folder = '/cul/cul0/ldpd/archivesspace/test/ead'  # for testing

    # Use in notification email to distinguish errors/warnings
    icons = {
        "redx": "\U0000274C",
        "exclamation": "\U00002757",
        "warning": "\U000026A0\U0000FE0F",
        "qmark": "\U00002753",
    }

    # check for malformed xml. If there is, then don't do further validation because it will fail once it hits an unparseable file.

    print(" ")
    print("====== Checking well-formedness ... ======")

    parse_errs = []
    try:
        x = util.run_bash("xmllint " + data_folder + "/* --noout",
                          errorPrefix="PARSE")
        # print(x)
        log_it("All files well-formed.")

    except Exception as e:
        if "PARSEERROR" in str(e):
            parse_errs = [
                msg_parse(l, icons["redx"]) for l in str(e).splitlines()
                if "as_ead" in l
            ]

        parse_errs = clean_array(parse_errs)
        if parse_errs:
            for e in get_unique_bibid_all_errors(parse_errs):
                log_it(icons["redx"] + "PARSE ERROR: " + e)

    parse_err_cnt = get_unique_count(parse_errs)

    if parse_errs:

        log_it(
            "There were " + str(parse_err_cnt) +
            " unparseable records! Validation of files could not be completed. Fix syntax and run script again."
        )
        parse_sheet.clear()
        parse_sheet.appendData(parse_errs)
        quit()

    # No parsing errors, so proceed...
    parse_sheet.clear()

    print(" ")
    print("====== Validating files... ======")

    # Batch validate against RNG schema.
    x = util.jing_process_batch(data_folder, schema_path, "as_ead*")

    schema_errs = [
        msg_parse(l, icons["exclamation"]) for l in str(x).splitlines()
        if "as_ead" in l
    ]

    schema_err_cnt = get_unique_count(schema_errs)

    if schema_errs:
        for e in get_unique_bibid_all_errors(schema_errs):
            log_it(icons["exclamation"] + "VALIDATION ERROR: " + e)
    else:
        log_it("All files are valid.")

    validation_sheet.clear()
    validation_sheet.appendData(schema_errs)

    print(" ")
    print("====== Evaluating with XSLT ... ======")

    try:
        x = util.saxon_process(xslt_path,
                               xslt_path,
                               csv_out_path,
                               theParams="filePath=" + data_folder)
        eval_sheet.clear()
        eval_sheet.importCSV(csv_out_path, delim="|")

    except Exception as e:
        if "SAXON ERROR" in str(e):
            print("Cancelled!")

    evals = eval_sheet.getDataColumns()[0]
    eval_bibs = set(evals)
    warnings_cnt = len(eval_bibs)

    if evals:
        log_it(icons["warning"] + " " + str(len(evals)) + " warnings in " +
               str(warnings_cnt) + " files.")
    else:
        log_it("There were no problems found!")

    the_tabs = validation_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)

    if "log" in the_tabs:
        log_range = "log!A:A"
        my_duration = str(now2 - now1)

        the_log = ("EADs from " + data_folder + " evaluated by " +
                   schema_path + " and " + xslt_path + ". Parse errors: " +
                   str(parse_err_cnt) + ". Schema errors: " +
                   str(schema_err_cnt) + ". XSLT warnings: " +
                   str(warnings_cnt) + ". Start: " + start_time +
                   ". Finished: " + end_time + " (duration: " + my_duration +
                   ").")

        # today = datetime.datetime.today().strftime('%c')
        dataSheet(validation_sheet.id, log_range).appendData([[the_log]])
    else:
        print("*** Warning: There is no log tab in this sheet. ***")

    print(" ")

    # print(the_log)

    log_it("Files with parse errors: " + str(parse_err_cnt))
    log_it("Files with schema errors: " + str(schema_err_cnt))
    log_it("Files with warnings: " + str(warnings_cnt))

    print(" ")

    exit_msg = ("Script done. Check report sheet for more details: " +
                validation_sheet.url)
    log_it(exit_msg)

    quit()
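The rsync step above likewise goes through a project-local wrapper, util.rsync_process(fromPath, toPath, myOptions). A minimal sketch of such a wrapper, assuming it simply shells out to rsync and raises on a non-zero exit (none of which is confirmed by these examples), might look like this:

import subprocess

def rsync_process_sketch(from_path, to_path, options=""):
    # Illustrative only; the real util.rsync_process is not shown on this page.
    # shell=True keeps quoted options such as --exclude 'clio*' intact.
    cmd = "rsync -av " + options + " " + from_path + " " + to_path
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception("RSYNC ERROR: " + result.stderr)
    return result.stdout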
Example #5
x = the_sheet.lookup('4079432', 0, 1)

print(x)

print(' ')

print('testing archivesspace api...')

x = asf.getResource(2, 5907)

print(x)

print(' ')

print("testing saxon ...")

saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')
source_dir = '/cul/cul0/ldpd/archivesspace/oai'
in_file = os.path.join(source_dir, '20201111.asClean.xml')
xsl_file = os.path.join(my_path, '../xslt/extract-bibids.xsl')
params = 'filename=' + in_file
x = util.saxon_process(in_file, xsl_file, None, theParams=params)
print(x)

print("This is a test!")

print("Yes it worked...")

#
Example #6
def main():

    # x = get_oapen_item(627426)
    # pprint(x)

    xml_dir = '/Users/dwh2128/Documents/SimplyE/books/Gutenberg/epub/'

    xslt_path = '/Users/dwh2128/Documents/SimplyE/books/Gutenberg/gutenberg_to_opds.xsl'

    output_folder = 'output/oa_clio/aaw/'

    sheet_id = '1aS2zZzDOAzr-LwNGjhxEofIfLBWIO0XM2Ft43Ec1amo'

    sheet_tab = 'AAW'
    # sheet_tab = 'Sheet1'
    # sheet_tab = 'Test'
    feed_stem = 'gutenberg_feed'
    collection_title = "Project Gutenberg EBooks | Columbia University Libraries"
    print('Extracting ' + sheet_tab + ' ... ')

    the_info = get_collection(sheet_id,
                              sheet_tab,
                              feed_stem,
                              collection_title,
                              multipart=False)

    # Divide list into chunks

    # chunk_size = 5
    chunk_size = 500
    total_count = len(the_info)
    print('Total count: ' + str(total_count))
    running_count = 0
    the_chunks = divide_list(the_info, chunk_size)

    for idx, record_chunk in enumerate(the_chunks):

        running_count += len(record_chunk)
        print('Running_count = ' + str(running_count))
        print('')
        page_no = idx + 1
        if page_no > 1:
            feed_name = feed_stem + '_p' + str(page_no) + '.xml'
            feed_list_name = feed_stem + '_list_p' + str(page_no) + '.xml'
        else:
            feed_name = feed_stem + '.xml'
            feed_list_name = feed_stem + '_list' + '.xml'

        # Add feed_next, only if it is not the last one
        if running_count < total_count:
            feed_next_name = feed_stem + '_p' + str(page_no + 1) + '.xml'
            feed_next_path = 'https://ebooks.library.columbia.edu/static-feeds/oa_clio/' + feed_next_name
        else:
            feed_next_name = ''
            feed_next_path = ''

        root = etree.Element("records")
        for r in record_chunk:
            rdf_path = xml_dir + str(r['id']) + '/pg' + str(r['id']) + '.rdf'
            # Look to verify that there is an RDF file to get data from.
            if os.path.exists(rdf_path):

                rec = etree.SubElement(root, "record")
                bibid = etree.SubElement(rec, "bibid")
                bibid.text = r['bibid']
                bookid = etree.SubElement(rec, "bookid")
                bookid.text = r['id']
            else:
                print("Warning: could not find RDF file for " + str(r['id']))

        # print(etree.tostring(root, pretty_print=True))
        list_file_path = 'output/' + feed_list_name
        with open(list_file_path, 'wb') as f:
            f.write(etree.tostring(root, pretty_print=True))

        # feed_file_name = feed_stem + '.xml'

        util.saxon_process(
            list_file_path,
            xslt_path,
            output_folder + feed_name,
            theParams=
            'feedURL=https://ebooks.library.columbia.edu/static-feeds/oa_clio/'
            + feed_name + ' feedNext=' + feed_next_path)

    val = validate_files(output_folder)

    the_errors = [f for f in val if f['errors']]
    if the_errors:
        print(the_errors)
    else:
        print("All files are valid!")

    quit()
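Example #6 also depends on a divide_list helper that is not shown. Its use (splitting the_info into batches of at most chunk_size records) suggests something like the sketch below; the name divide_list comes from the example, everything else is assumed.

def divide_list(items, chunk_size):
    # Split items into consecutive chunks of at most chunk_size elements.
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]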
Example #7
def main():

    asf.setServer('Prod')  # AS instance: Prod | Dev | Test

    mode = 'Prod'  # Prod or Test

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    ########################
    ### PROCESS OAI DATA ###
    ########################

    # Set path to Saxon processor
    # saxon_path = os.path.join(my_path, "../../resources/saxon-9.8.0.12-he.jar")

    # XSLT file to generate report
    marc_xslt_file = os.path.join(my_path, '../xslt/marcDataExtract.xsl')

    if mode == 'Prod':
        # OAI XML file to use as source
        # source_dir='/cul/cul0/lito/libsys/voyager/prod/data/loads/AS_harvest'
        source_dir = '/cul/cul0/ldpd/archivesspace/oai'
        sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
        oai_file = source_dir + '/' + yest_str + '.asAllRaw.xml'

    else:  # TEST
        yest_str = "20190915"
        # OAI XML file to use as source
        source_dir = '/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/cached_eads_20190912'  # local test
        sheet_id = '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ'
        oai_file = yest_str + '.asAllRaw.xml'
    the_sheets = {
        'oai': dataSheet(sheet_id, 'oai!A:Z'),
        'oai_last': dataSheet(sheet_id, 'oai_last!A:Z'),
        'log': dataSheet(sheet_id, 'log!A:Z')
    }

    the_outpath = os.path.join(my_path,
                               'output/' + yest_str + '.marc_reporter_out.xml')

    print(' ')

    # Copy oai current data to oai_last sheet for diff
    the_old_data = the_sheets['oai'].getData()
    the_sheets['oai_last'].clear()
    the_sheets['oai_last'].appendData(the_old_data)
    # Process OAI MARC and output to CSV
    util.saxon_process(oai_file, marc_xslt_file, the_outpath)

    # clear data from "new" sheet
    the_sheets['oai'].clear()

    # Send result csv to Google Sheet.
    y = the_sheets['oai'].importCSV(the_outpath, delim='|')

    print(' ')

    ########################
    ### PROCESS UNPUBLISHED ###
    ########################

    print('Finding unpublished records...')

    the_repos = [2, 3, 4, 5]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_heads = [
        'REPO', 'REPO_ID', 'RESOURCE_ID', 'TITLE', 'BIBID', 'CREATE_TIME',
        'SYSTEM_MTIME', 'LAST_MODIFIED_BY'
    ]

    unpubs_sheet = dataSheet(sheet_id, 'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # print(row)
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except (KeyError, TypeError):
                call_no = ''
            # get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid)
            row.insert(0, repo_id)
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    # print('Total unpublished: ' + str(len(the_unpublished)))
    msg = 'Total unpublished: ' + str(len(the_unpublished))
    print(msg)
    digester.post_digest(script_name, msg)  # Test

    unpubs_sheet.clear()
    unpubs_sheet.appendData([the_heads])
    unpubs_sheet.appendData(the_unpublished)

    ########################
    ### GET NEWLY CREATED ###
    ########################

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:

        print('processing ' + d['filter'])

        the_delta_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]

        the_heads = [
            'repo', 'asid', 'title', 'identifier', 'create_time',
            'system_mtime', 'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:

            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='ctime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                # print(row)
                # get the repo from the uri string.
                repo = str(row[0].split('/')[-3]).rstrip()
                # get the asid from the uri string.
                asid = str(row[0].split('/')[-1]).rstrip()
                row.pop(0)
                row.insert(0, asid)
                row.insert(0, repo)

                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))

        digester.post_digest(script_name, 'New ' + d['filter'] + ': ' +
                             str(len(the_modifieds)))  # Test
        # the_sheet.clear()

        # the_sheet.appendData([the_fields])
        the_delta_sheet.appendData(the_modifieds)

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = 'Data imported by ' + my_name + '. Start: ' + start_time + \
        '. Finished: ' + end_time + ' (duration: ' + my_duration + ').'

    the_sheets['log'].appendData([[the_log]])

    print(' ')

    print(the_log)

    digester.post_digest(script_name, the_log)  # Test

    print(' ')

    print('Script done. Updated data is available at ' + the_sheets['oai'].url)
Example #8
def main():

    MY_NAME = __file__
    global SCRIPT_NAME
    SCRIPT_NAME = os.path.basename(MY_NAME)

    # This makes sure the script can be run from any working directory and still find related files.
    MY_PATH = os.path.dirname(__file__)

    sheet_id = '1Ltf5_hhR-xN4YSvNWmPX8bqJA1UjqAaSjgeHBr_5chA'

    parse_sheet = dataSheet(sheet_id, 'parse!A:Z')  # Test
    validation_sheet = dataSheet(sheet_id, 'schema!A:Z')  # Test
    eval_sheet = dataSheet(sheet_id, 'eval!A:Z')  # Test

    # This is a dupe for other reporting
    # the_data_sheet2 = dataSheet(
    #     "198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY", "validation!A:Z")

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    ################################
    #
    # Rsync files from web application to storage directory
    #
    ################################

    print("====== Syncing files from production cache... ======")
    print(" ")

    # keyPath = "/home/ldpdserv/.ssh/id_dsa"
    fromPath = (
        "ldpdserv@ldpd-nginx-prod1:/opt/passenger/ldpd/findingaids_prod/caches/ead_cache"
    )
    toPath = "/cul/cul0/ldpd/archivesspace/"

    myOptions = "--exclude 'clio*'"

    x = util.rsync_process(fromPath, toPath, myOptions)
    print(x)

    print(" ")

    ################################
    #
    # Perform validation reporting
    #
    ################################

    schema_path = os.path.join(MY_PATH, "../schemas/cul_as_ead.rng")

    csv_out_path = os.path.join(MY_PATH, "temp_out.txt")

    xslt_path = os.path.join(MY_PATH, "../schemas/cul_as_ead2.xsl")  # test

    data_folder = "/cul/cul0/ldpd/archivesspace/ead_cache"
    # data_folder = "/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/ead_rsync_test"  # test
    # data_folder = '/cul/cul0/ldpd/archivesspace/test/ead'  # for testing

    # Use in notification email to distinguish errors/warnings
    icons = {
        "redx": "\U0000274C",
        "exclamation": "\U00002757",
        "warning": "\U000026A0\U0000FE0F",
        "qmark": "\U00002753",
    }

    # check for malformed xml. If there is, then don't do further validation because it will fail once it hits an unparseable file.

    print(" ")
    print("====== Checking well-formedness ... ======")

    parse_errs = []
    try:
        x = util.run_bash('xmllint ' + data_folder + '/* --noout',
                          errorPrefix='PARSE')
        # print(x)
        log_it("All files well-formed.")

    except Exception as e:
        if 'PARSEERROR' in str(e):
            parse_errs = [
                msg_parse(l, icons['redx']) for l in str(e).splitlines()
                if 'as_ead' in l
            ]

            # print(parse_errs)
        for e in get_unique_bibids(parse_errs):
            log_it(icons['redx'] + " " + str(e) + " has parsing errors.")

    parse_err_cnt = get_unique_count(parse_errs)

    if parse_errs:

        log_it(
            'There were ' + str(parse_err_cnt) +
            ' unparseable records! Validation of files could not be completed. Fix syntax and run script again.'
        )
        parse_sheet.clear()
        parse_sheet.appendData(parse_errs)
        quit()

    # No parsing errors, so proceed...
    parse_sheet.clear()

    print(" ")
    print("====== Validating files... ======")

    # Validate against schema. Xargs batches files so they won't exceed
    # limit on arguments with thousands of files.

    x = util.run_bash('find ' + data_folder +
                      ' -name "as_ead*"  | xargs -L 128 java -jar ' +
                      util.config['FILES']['jingPath'] + ' -d ' + schema_path,
                      errorPrefix='JING')

    schema_errs = [
        msg_parse(l, icons['exclamation']) for l in str(x).splitlines()
        if 'as_ead' in l
    ]

    schema_err_cnt = get_unique_count(schema_errs)

    if schema_errs:
        for e in get_unique_bibids(schema_errs):
            log_it(icons['exclamation'] + " " + str(e) +
                   " has validation errors.")
    else:
        log_it("All files are valid.")

    validation_sheet.clear()
    validation_sheet.appendData(schema_errs)

    print(" ")
    print("====== Evaluating with XSLT ... ======")

    try:
        x = util.saxon_process(xslt_path,
                               xslt_path,
                               csv_out_path,
                               theParams='filePath=' + data_folder)
        eval_sheet.clear()
        eval_sheet.importCSV(csv_out_path, delim='|')

    except Exception as e:
        if "SAXON ERROR" in str(e):
            print("Cancelled!")

    evals = eval_sheet.getDataColumns()[0]
    eval_bibs = set(evals)
    warnings_cnt = len(eval_bibs)

    if evals:
        log_it(icons['warning'] + " " + str(len(evals)) + " warnings in " +
               str(warnings_cnt) + " files.")
    else:
        log_it("There were no problems found!")

    the_tabs = validation_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)

    if "log" in the_tabs:
        log_range = "log!A:A"
        my_duration = str(now2 - now1)

        the_log = ("EADs from " + data_folder + " evaluated by " +
                   schema_path + " and " + xslt_path + ". Parse errors: " +
                   str(parse_err_cnt) + ". Schema errors: " +
                   str(schema_err_cnt) + ". XSLT warnings: " +
                   str(warnings_cnt) + ". Start: " + start_time +
                   ". Finished: " + end_time + " (duration: " + my_duration +
                   ").")

        # today = datetime.datetime.today().strftime('%c')
        dataSheet(validation_sheet.id, log_range).appendData([[the_log]])
    else:
        print("*** Warning: There is no log tab in this sheet. ***")

    print(" ")

    # print(the_log)

    log_it("Files with parse errors: " + str(parse_err_cnt))
    log_it("Files with schema errors: " + str(schema_err_cnt))
    log_it("Files with warnings: " + str(warnings_cnt))

    print(" ")

    exit_msg = "Script done. Check report sheet for more details: " + validation_sheet.url
    log_it(exit_msg)

    quit()
Example #9
def main():

    # Set to True to harvest complete set; otherwise will select based on date.
    HARVESTALL = False

    my_name = __file__
    my_path = os.path.dirname(__file__)
    script_name = os.path.basename(my_name)

    # calculate dates in format yyyymmdd
    today = datetime.date.today().strftime("%Y%m%d")
    yesterday = (datetime.date.today() -
                 datetime.timedelta(days=1)).strftime("%Y%m%d")

    destination_folder = "/cul/cul0/ldpd/archivesspace/oai"
    # destination_folder = "/cul/cul0/ldpd/archivesspace/test"  # test
    # destination_folder = "./"  # test
    xslt_path = os.path.join(my_path, "../xslt/cleanOAI.xsl")

    out_path_raw = os.path.join(destination_folder, today + ".asRaw.xml")
    out_path_raw_all = os.path.join(destination_folder,
                                    today + ".asAllRaw.xml")
    out_path_clean = os.path.join(destination_folder, today + ".asClean.xml")

    # Set server to Prod | Test | Dev
    server = "Prod"

    fromDate = yesterday

    # # Not using date, get all records and then filter with the XSLT!
    # date_params = ""

    # Select date interval for harvest
    # TODO: change this to be controlled by param file.

    if HARVESTALL:
        date_params = " "  # Use this to harvest all records.
    else:
        date_params = "-f " + yesterday

    # Harvest OAI-PMH data
    print("Harvesting data from OAI...")
    util.oai_harvest(out_path_raw, server=server, date_params=date_params)

    # Process through XSLT

    # TODO: change xsl to not require this param, if we are doing it in the harvest!
    time_offset = 'P800DT30H'

    saxon_params = " time_offset=" + time_offset

    print("Processing file with XSLT...")
    x = util.saxon_process(out_path_raw,
                           xslt_path,
                           out_path_clean,
                           theParams=saxon_params)
    print(x)
    digester.post_digest(script_name, x)

    print("Harvesting all records for reporting ...")
    date_params = " "
    util.oai_harvest(out_path_raw_all, server=server, date_params=date_params)

    # Remove old OAI files
    util.file_cleanup(destination_folder, 30)

    digester.post_digest(
        script_name, script_name + ' completed at ' +
        str(datetime.datetime.now().strftime('%m/%d/%Y %H:%M:%S')) + '.')
Example #10
def main():

    report_level = "low"
    # 'low' = only parse/schema errors; 'high' = include schematron warnings

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    print("Script " + my_name + " begun at " + start_time + ". ")
    print(" ")

    ################################
    #
    # Rsync files from web application to storage directory
    #
    ################################

    print("====== Syncing files from production cache... ======")
    print(" ")

    keyPath = "/home/ldpdapp/.ssh/id_dsa"
    fromPath = (
        "ldpdapp@ldpd-nginx-prod1:/opt/passenger/ldpd/findingaids_prod/caches/ead_cache"
    )
    toPath = "/cul/cul0/ldpd/archivesspace/"

    myOptions = "--exclude 'clio*'"

    x = util.rsync_process(keyPath, fromPath, toPath, myOptions)
    print(x)

    print(" ")

    ################################
    #
    # Perform validation reporting
    #
    ################################

    print("====== Validating files... ======")
    print(" ")

    if report_level == "high":
        print('* Logging level: "' + report_level +
              '" — showing all errors and warnings. *')
    else:
        print(
            '* Logging level: "' + report_level +
            '" – showing only errors. Check report for complete results including warnings. *'
        )

    print(" ")

    # The Google Sheet to send data to
    the_data_sheet = dataSheet("1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0",
                               "validation!A:Z")

    # the_data_sheet = dataSheet(
    #     '1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0', 'test!A:Z')  # Test

    # This is a dupe for other reporting
    the_data_sheet2 = dataSheet("198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY",
                                "validation!A:Z")

    # Set path to saxon processor for evaluator xslt
    saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')

    # Set path to schema validator (Jing)
    jing_path = os.path.join(my_path,
                             "../../resources/jing-20091111/bin/jing.jar")

    schema_filename = "schemas/cul_as_ead.rng"
    # schematron_filename = "schemas/cul_as_ead.sch"
    xslt_filename = "schemas/cul_as_ead.xsl"
    schema_path = os.path.join(my_path, schema_filename)
    xslt_path = os.path.join(my_path, xslt_filename)

    data_folder = "/cul/cul0/ldpd/archivesspace/ead_cache"
    # data_folder = '/cul/cul0/ldpd/archivesspace/test/ead'  # for testing

    # Use in notification email to distinguish errors/warnings
    icons = {
        "redx": "\U0000274C",  # use for parse errors
        "exclamation": "\U00002757",
        "warning": "\U000026A0\U0000FE0F",  # use for schema validation errors
        "qmark": "\U00002753",
    }

    # Load files from directory into a list
    the_file_paths = []
    for root, dirs, files in os.walk(os.path.abspath(data_folder)):
        for file in files:
            the_file_paths.append(os.path.join(root, file))

    # The column heads for the report spreadsheet
    the_heads = [
        "bibid",
        "file",
        "well-formed?",
        "valid?",
        "schema output",
        "schematron output",
        "warning type",
    ]

    the_results = []

    the_results.append(the_heads)

    # counters
    parse_errors = 0
    validation_errors = 0
    sch_warnings = 0

    for a_file in the_file_paths:
        the_file_data = []
        file_name = a_file.split("/")[-1]
        bibid = file_name.split("_")[-1].split(".")[0]

        validation_result = util.jing_process(jing_path, a_file, schema_path)

        if "fatal:" in validation_result:
            # It's a parsing error.
            err_msg = icons["redx"] + " FATAL ERROR: " + \
                file_name + " could not be parsed!"
            print(err_msg)
            digester.post_digest(script_name, err_msg)
            wf_status = False
            validation_status = False
            parse_errors += 1
        else:
            wf_status = True
            if "error:" in validation_result:
                # It's a validation error.
                validation_status = False
                err_msg = icons["warning"] + " ERROR: " + \
                    file_name + " contains validation errors."
                print(err_msg)
                digester.post_digest(script_name, err_msg)
                validation_errors += 1
            else:
                validation_status = True

        if validation_result:
            validation_result_clean = clean_output(validation_result,
                                                   incl_types=False)[0]
        else:
            validation_result_clean = validation_result

        if not wf_status:
            schematron_result_clean = "-"
            warning_types = []

        else:

            # schematron_result = util.jing_process(
            #     jing_path, a_file, schematron_path)
            schematron_result = util.saxon_process(saxon_path, a_file,
                                                   xslt_path, None)

            if schematron_result:
                # It's a Schematron violation.
                if report_level == "high":
                    # Only show if required by reporting level var (use to filter out large numbers of warnings).
                    err_msg = "WARNING: " + file_name + " has Schematron rule violations."
                    print(err_msg)
                    digester.post_digest(script_name, err_msg)
                sch_warnings += 1

            if schematron_result:
                x = clean_output(schematron_result, incl_types=True)
                schematron_result_clean = x[0]
                warning_types = x[1]
            else:
                schematron_result_clean = ""
                warning_types = ""

        the_file_data = [
            bibid,
            file_name,
            wf_status,
            validation_status,
            validation_result_clean,
            schematron_result_clean,
            ", ".join(warning_types),
        ]

        the_results.append(the_file_data)

    the_data_sheet.clear()
    the_data_sheet.appendData(the_results)
    the_data_sheet2.clear()
    the_data_sheet2.appendData(the_results)

    # generate log and add to log tab, if exists.
    the_tabs = the_data_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("EADs from " + data_folder + " evaluated by " +
               schema_filename + " and " + xslt_filename + ". Parse errors: " +
               str(parse_errors) + ". Schema errors: " +
               str(validation_errors) + ". Schematron warnings: " +
               str(sch_warnings) + ". Start: " + start_time + ". Finished: " +
               end_time + " (duration: " + my_duration + ").")

    if "log" in the_tabs:
        log_range = "log!A:A"
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(the_data_sheet.id, log_range).appendData([[the_log]])
    else:
        print("*** Warning: There is no log tab in this sheet. ***")

    print(" ")

    # print(the_log)

    print("Parse errors: " + str(parse_errors))
    digester.post_digest(script_name, "Parse errors: " + str(parse_errors))
    print("Schema errors: " + str(validation_errors))
    digester.post_digest(script_name,
                         "Schema errors: " + str(validation_errors))
    print("Schematron warnings: " + str(sch_warnings))
    digester.post_digest(script_name,
                         "Schematron warnings: " + str(sch_warnings))

    print(" ")

    exit_msg = "Script done. Check report sheet for more details: " + the_data_sheet.url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)

    quit()