Example 1
def main():

    my_name = __file__
    script_name = os.path.basename(my_name)

    saxon_path = '../../resources/saxon-9.8.0.12-he.jar'
    in_path = '~/ohac_marc.xml'
    xsl_path = './oral2solr.xsl'
    out_path = '~/ohac_marc_OUT.xml'

    digester.post_digest(script_name, "THIS IS A TEST")

    print(acfa.run_saxon(saxon_path, in_path, xsl_path, out_path))
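The acfa.run_saxon helper used above is not shown on this page. A minimal sketch of such a wrapper, assuming it simply shells out to the Saxon HE jar with the standard -s:, -xsl: and -o: options (hypothetical, not the project's actual implementation):

import subprocess

def run_saxon(saxon_path, in_path, xsl_path, out_path, theParams=None):
    # Hypothetical sketch: invoke Saxon HE from the command line.
    cmd = ['java', '-jar', saxon_path,
           '-s:' + in_path, '-xsl:' + xsl_path, '-o:' + out_path]
    if theParams:
        # Optional name=value stylesheet parameters, e.g. 'repo=nnc-a' (see Example 4).
        cmd += theParams.split()
    result = subprocess.run(cmd, capture_output=True, text=True)
    # Return whatever Saxon printed so callers can check it for errors.
    return result.stdout + result.stderr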
Example 2
def ohac_extract():
    # extract_script_path = '/cul/cul0/ldpd/ccoh/fetchOralHistoryRecords'
    extract_script_path = os.path.join(my_path, './fetchOralHistoryRecords')
    # marc_output_path = '/cul/cul0/ldpd/archival_data/test/marc/ohac_marc.xml'  # Test
    marc_output_path = '/cul/cul0/ldpd/archival_data/marc/oral_history_portal/ohac_marc.xml'
    # marc_output_clean_path = '/cul/cul0/ldpd/archival_data/test/marc/ohac_marc_clean.xml'  # Test
    marc_output_clean_path = '/cul/cul0/ldpd/archival_data/marc/oral_history_portal/ohac_marc_clean.xml'
    # solr_output_path = '/cul/cul0/ldpd/archival_data/test/solr/ohac_solr.xml'  # Test
    solr_output_path = '/cul/cul0/ldpd/archival_data/solr/ohac_solr.xml'
    # saxon_path = os.environ['HOME'] + '/lib/saxon-9he.jar'
    saxon_path = '/opt/dcps/resources/saxon-9.8.0.12-he.jar'
    xslt_path = os.path.join(my_path, 'oral2solr.xsl')

    # remove existing file so fetchOralHistoryRecords won't fail.
    if os.path.exists(marc_output_path):
        print("Removing old file at " + marc_output_path)
        os.remove(marc_output_path)

    the_shell_command = extract_script_path + ' --output ' + marc_output_path

    print('Extracting OHAC MARC data from Voyager...')

    res = acfa.run_bash(the_shell_command)
    # print(res)
    if reporting:
        digester.post_digest(script_name, res)  # reporting

    # Use a regex to remove illegal characters. See ACFA-270.
    res = acfa.sanitize_xml(marc_output_path, marc_output_clean_path)
    if res:
        print(res)
        if reporting:
            digester.post_digest(script_name, res)  # reporting

    print('Transforming MARC to SOLR XML...')

    response = acfa.run_saxon(saxon_path, marc_output_clean_path, xslt_path,
                              solr_output_path)
    print(response)
    if "ERROR" in response:
        return []
    else:
        return [solr_output_path]
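Example 2 relies on two more acfa helpers that are not shown here: run_bash and sanitize_xml (the latter strips characters disallowed by XML 1.0, per the ACFA-270 comment). A hedged sketch of both, offered as an assumption about their behavior rather than the project's real code:

import re
import subprocess

def run_bash(cmd):
    # Hypothetical: run a shell command string and return its combined output.
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return result.stdout + result.stderr

# Control characters that are not legal in XML 1.0 (assumed to be the
# "illegal characters" referred to in ACFA-270).
ILLEGAL_XML_CHARS = re.compile('[\x00-\x08\x0b\x0c\x0e-\x1f]')

def sanitize_xml(in_path, out_path):
    # Hypothetical: copy in_path to out_path minus illegal characters and
    # report how many were removed (empty string means nothing to report).
    with open(in_path, 'r', encoding='utf-8', errors='replace') as f:
        data = f.read()
    cleaned, count = ILLEGAL_XML_CHARS.subn('', data)
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(cleaned)
    if count:
        return 'Removed ' + str(count) + ' illegal character(s) from ' + in_path
    return ''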
Example 3
def main():

    solr_index_envs = []
    if len(sys.argv) > 1:
        solr_index_envs = sys.argv[1].split(',')
    else:
        # Exit because there was no argument dev|test|prod.
        sys.exit("Error: No solr_index_env argument(s) provided!")

    # Only turn on digest reporting if running on Prod.
    global reporting
    if 'prod' in solr_index_envs:
        print("Reporting == True!")
        reporting = True

    solr_update_urls = [
        "http://ldpd-solr-" + solr_index_env +
        "1.cul.columbia.edu:8983/solr/archives_portal/update"
        for solr_index_env in solr_index_envs
    ]
    for solr_xml_path in archival_collections_extract():
        for solr_update_url in solr_update_urls:
            acfa.run_post(solr_xml_path, solr_update_url)
    for solr_xml_path in ohac_extract():
        for solr_update_url in solr_update_urls:
            acfa.run_post(solr_xml_path, solr_update_url)
    my_path = os.path.dirname(__file__)
    commit_xml_path = os.path.join(my_path, 'commit.xml')
    delete_xml_path = os.path.join(my_path, 'delete-delta.xml')
    for solr_update_url in solr_update_urls:
        # commit the document add/updates
        acfa.run_post(commit_xml_path, solr_update_url)
        # delete everything that wasn't added/updated in this job
        acfa.run_post(delete_xml_path, solr_update_url)
        # commit the deletes
        acfa.run_post(commit_xml_path, solr_update_url)

    if reporting:
        digester.post_digest(
            script_name, script_name + ' completed at ' +
            str(datetime.datetime.now().strftime('%m/%d/%Y %H:%M:%S')) + '.')
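acfa.run_post pushes each generated XML file (document adds, the commit.xml commit message, and the delete-delta.xml delete query) to the Solr update handler. A minimal sketch of what it might do, assuming the requests library and a standard Solr XML update endpoint (hypothetical, not the project's actual helper):

import requests

def run_post(xml_path, solr_update_url):
    # Hypothetical: POST an XML update message (adds, deletes, or commits) to Solr.
    with open(xml_path, 'rb') as f:
        resp = requests.post(solr_update_url, data=f.read(),
                             headers={'Content-Type': 'text/xml'})
    return str(resp.status_code) + ' ' + resp.text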
Example 4
def archival_collections_extract():
    # marc_data_folder = '/cul/cul0/ldpd/archival_data/test/marc/archives_portal'
    # solr_output_folder = '/cul/cul0/ldpd/archival_data/test/solr'
    marc_data_folder = '/cul/cul0/ldpd/archival_data/marc/archives_portal'
    solr_output_folder = '/cul/cul0/ldpd/archival_data/solr'
    # saxon_path = os.environ['HOME'] + '/lib/saxon-9he.jar'
    saxon_path = '/opt/dcps/resources/saxon-9.8.0.12-he.jar'
    xslt_path = os.path.join(my_path, 'marc2solr.xsl')

    the_repos = [{
        'data_file': 'AV.xml',
        'clean_file': 'AV_clean.xml',
        'repo_id': 'nnc-a'
    }, {
        'data_file': 'EA.xml',
        'clean_file': 'EA_clean.xml',
        'repo_id': 'nnc-ea'
    }, {
        'data_file': 'HS.xml',
        'clean_file': 'HS_clean.xml',
        'repo_id': 'nnc-m'
    }, {
        'data_file': 'CCOH.xml',
        'clean_file': 'CCOH_clean.xml',
        'repo_id': 'nnc-ccoh'
    }, {
        'data_file': 'RB.xml',
        'clean_file': 'RB_clean.xml',
        'repo_id': 'nnc-rb'
    }, {
        'data_file': 'UA.xml',
        'clean_file': 'UA_clean.xml',
        'repo_id': 'nnc-ua'
    }, {
        'data_file': 'UT.xml',
        'clean_file': 'UT_clean.xml',
        'repo_id': 'nnc-ut'
    }]

    transform_paths = []
    # error_pattern = re.compile("^SAXON ERROR")
    for r in the_repos:
        raw_file_path = marc_data_folder + '/' + r['data_file']
        clean_file_path = marc_data_folder + '/' + r['clean_file']
        repo_id = r['repo_id']
        out_path = solr_output_folder + '/' + repo_id + '_solr' + '.xml'
        the_params = 'repo=' + repo_id

        repo_msg = 'Processing ' + r['data_file'] + '...'
        print(repo_msg)
        if reporting:
            digester.post_digest(script_name, repo_msg)  # reporting

        # strip out bad characters if any. See ACFA-270.
        res = acfa.sanitize_xml(raw_file_path, clean_file_path)
        if res:
            print(res)
            if reporting:
                digester.post_digest(script_name, res)  # reporting

        # transform to solr xml
        response = acfa.run_saxon(saxon_path,
                                  clean_file_path,
                                  xslt_path,
                                  out_path,
                                  theParams=the_params)

        print(response)
        if reporting:
            digester.post_digest(script_name, response)  # reporting
        if "ERROR" not in response:
            transform_paths.append(out_path)
    return transform_paths
Example 5
def log_it(script, log):
    if DEBUG is not True:
        digester.post_digest(script, log)
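digester.post_digest appears throughout these examples; it queues a message under a script name for a later digest report. The module itself is not shown, so the following is only a guess at its shape, with the digest path an invented placeholder:

import os

# Assumed location; the real digester presumably configures this elsewhere.
DIGEST_PATH = os.environ.get('DIGEST_PATH', '/tmp/digest.log')

def post_digest(script, message):
    # Hypothetical: append a script-tagged line to the digest file.
    with open(DIGEST_PATH, 'a') as f:
        f.write(str(script) + ': ' + str(message) + '\n')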
Example 6
            d = ""
        the_row.append(d)
    # print(the_row)

    # Handle subclassifications
    if 'terms' in s:
        the_terms = s['terms']
        for t in the_terms:
            the_row.append(t['term'] + ' [' + t['term_type'] + ']')

    the_output.append(the_row)

the_sheet.clear()
save = the_sheet.appendData(the_output)
print(save)
digester.post_digest(script_name,
                     "Total subject records: " + str(len(the_output) - 1))

# Generate log

now2 = datetime.datetime.now()
end_time = str(now2)
my_duration = str(now2 - now1)

the_log = (str(subj_cnt) + " subject records imported by " + my_name + ". " +
           " Start: " + start_time + ". Finished: " + end_time +
           " (duration: " + my_duration + ").")

log_range = "log!A:A"
log_sheet = dataSheet(sheet_id, log_range)

log_sheet.appendData([[the_log]])
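The dataSheet class (clear, appendData, getData, importCSV) wraps a Google Sheets range. Its implementation is not included on this page; a rough sketch of the first three methods, assuming google-api-python-client and an already-obtained credentials object (the real class presumably handles authorization itself):

from googleapiclient.discovery import build

class DataSheetSketch:
    # Hypothetical stand-in for dataSheet, not the project's actual class.
    def __init__(self, sheet_id, data_range, credentials):
        self.id = sheet_id
        self.range = data_range
        self.service = build('sheets', 'v4', credentials=credentials)

    def clear(self):
        return self.service.spreadsheets().values().clear(
            spreadsheetId=self.id, range=self.range, body={}).execute()

    def appendData(self, rows):
        return self.service.spreadsheets().values().append(
            spreadsheetId=self.id, range=self.range,
            valueInputOption='USER_ENTERED', body={'values': rows}).execute()

    def getData(self):
        result = self.service.spreadsheets().values().get(
            spreadsheetId=self.id, range=self.range).execute()
        return result.get('values', [])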
Example 7
def main():

    # set to True to use test sheet and test json folder location.
    debug = False

    asf.setServer("Prod")

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ""  # set later
    today_str = str((date.today()).strftime("%Y%m%d"))

    if debug:
        print("[Running script in debug mode...]")
        parent_folder = "/cul/cul0/ldpd/archivesspace/test/resources"  # test folder
        sheet_id = "1wFyLN_Ea7ExCZSMuksB8MTrS9DjsUkwsmaPBujL7x0U"  # test sheet
        the_repos = [4]  # to test
    else:
        parent_folder = "/cul/cul0/ldpd/archivesspace/resources"
        sheet_id = "1T3EpIZmnh3Gk-VAIGtvavTQUIpS7AluyKQ8-sJsS8vg"
        the_repos = [2, 3, 4, 5, 6]

    output_folder = parent_folder + "/" + today_str

    the_sheets = {
        "resources": dataSheet(sheet_id, "Resources!A:Z"),
        "cm": dataSheet(sheet_id, "Collection Management!A:Z"),
        "log": dataSheet(sheet_id, "log!A:Z"),
    }

    # Set number of chars to truncate the scope and bioghist notes.
    trunc_len = 400

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["bibid", "/id_0"],
        ["title", "/title"],
        ["published", "/publish"],
        ["create_time", "/create_time"],
        ["system_mtime", "/system_mtime"],
        ["created_by", "/created_by"],
        ["last_modified_by", "/last_modified_by"],
        ["ead_location", "/ead_location"],
        ["ext_number", "/extents/0/number"],
        ["ext_portion", "/extents/0/portion"],
        ["ext_type", "/extents/0/extent_type"],
        # ["integer_1", "/user_defined/integer_1"],
        # ["integer_2", "/user_defined/integer_2"],
        # ["integer_3", "/user_defined/integer_3"],
        ["local call no.", "/user_defined/string_1"],
        ["other ctrl no. 1", "/user_defined/string_2"],
        ["other ctrl no. 2", "/user_defined/string_3"],
        ["other ctrl no. 3", "/user_defined/string_4"],
        # ["enum_1", "/user_defined/enum_1"],
        # ["enum_2", "/user_defined/enum_2"],
        ["description status", "/user_defined/enum_3"],
        ["collecting area", "/user_defined/enum_4"],
        ["level", "level"]
        # (Scope and bioghist notes are added in separately below.)
    ]

    # Get the collection management records for use in report.

    the_cms = []

    fields = [
        "id",
        "parent_id",
        "title",
        "system_mtime",
        "processing_priority",
        "processing_status",
    ]

    print(" ")
    print("*** Retrieve Collection Management Data ***")
    print(" ")

    for r in the_repos:
        print("Getting collection management records for repo: " + str(r) +
              "...")
        cm = asf.getCollectionManagements(r, filter="resource", fields=fields)
        for c in cm:
            row = [c[f] for f in fields]
            the_cms.append(row)

    # A data set of collection management records to post to the sheet below.
    the_cms.insert(0, fields)

    print(" ")
    print("*** Retrieve Resource Data ***")
    print(" ")

    # Get the list of resources for each repo and add to the_ids
    the_ids = []
    for r in the_repos:
        print("Getting ids for repo: " + str(r) + "...")
        asids = json.loads(
            asf.getResponse("/repositories/" + str(r) +
                            "/resources?all_ids=true"))

        print(str(len(asids)) + " records found in repo " + str(r) + ".")
        for i in asids:
            the_ids.append([r, i])

    # Construct the head row
    the_heads = [x[0] for x in the_fields]
    the_heads.insert(0, "asid")
    the_heads.insert(0, "repo")
    the_heads.append("scope note")
    the_heads.append("scopenote length")

    the_heads.append("bioghist note")
    the_heads.append("biognote length")

    the_output = [the_heads]

    # Fetch the resources from the ids
    print("Downloading resources...")

    if not os.path.exists(output_folder):
        print("Creating directory " + output_folder + "...")
        os.makedirs(output_folder)

    for repo, asid in the_ids:
        # print("Processsing " + str(repo) + ":" + str(asid) + "...")
        the_row = [repo, asid]
        res_json = asf.getResource(repo, asid)
        res_dict = json.loads(res_json)

        out_path = output_folder + "/" + str(repo) + "_" + str(asid) + ".json"

        # Write the JSON to file.
        with open(out_path, "w+") as f:
            f.write(res_json)

        # Use dpath to extract values from dict and compose into rows.
        for af in the_fields:
            try:
                d = str(dpath.util.get(res_dict, af[1]))
            except:
                d = ""
            the_row.append(d)

        # Process scope and bioghist notes

        the_notes = dpath.util.values(res_dict, "notes/*", afilter=None)

        the_scope_notes = []
        the_biog_notes = []

        for a_note in the_notes:
            try:
                if a_note["type"] == "scopecontent":
                    the_scope_notes.append(a_note)
            except:
                pass
            try:
                if a_note["type"] == "bioghist":
                    the_biog_notes.append(a_note)
            except:
                pass

        if the_scope_notes:
            # If there are scope notes, concatenate all their text and record the total length in characters.
            scope_note_texts = [
                s["subnotes"][0]["content"] for s in the_scope_notes
            ]
            the_scope_text = " ".join(scope_note_texts)
            scope_note_len = len(the_scope_text)

            scope_note_short = truncate_str(the_scope_text, length=trunc_len)
        else:
            scope_note_short = ""
            scope_note_len = 0

        if the_biog_notes:
            # If there are bioghist notes, concatenate all their text and record the total length in characters.
            biog_note_texts = [
                s["subnotes"][0]["content"] for s in the_biog_notes
            ]
            the_biog_text = " ".join(biog_note_texts)
            biog_note_len = len(the_biog_text)

            biog_note_short = truncate_str(the_biog_text, length=trunc_len)
        else:
            biog_note_short = ""
            biog_note_len = 0

        the_row.append(scope_note_short)
        the_row.append(str(scope_note_len))
        the_row.append(biog_note_short)
        the_row.append(str(biog_note_len))

        the_output.append(the_row)

    # Zip up the JSON files for storage.
    zip_out = make_archive(today_str,
                           "zip",
                           root_dir=parent_folder,
                           base_dir=today_str)

    print(zip_out)

    # Zip is saved in working dir; move to correct location.
    print("Saving zip file " + str(today_str) + ".zip to " + parent_folder)

    # Test if file already exists.
    if os.path.exists(parent_folder + "/" + str(today_str) + ".zip"):
        print("File " + parent_folder + "/" + str(today_str) +
              ".zip exists already. Replacing with new zip file...")

        os.remove(parent_folder + "/" + str(today_str) + ".zip")

    move(zip_out, parent_folder)

    # Remove the json folder once zip is in place.
    rmtree(parent_folder + "/" + today_str)

    util.file_cleanup(parent_folder, 60)

    # Write output to Google sheet.

    print(" ")
    print("*** Writing Data to Report ***")
    print(" ")

    the_sheets["cm"].clear()
    the_sheets["cm"].appendData(the_cms)
    digester.post_digest(
        script_name,
        "Total collection management records: " + str(len(the_cms) - 1))

    the_sheets["resources"].clear()
    the_sheets["resources"].appendData(the_output)
    digester.post_digest(
        script_name,
        "Total number of resource records: " + str(len(the_output) - 1))

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + my_name + ". Start: " + start_time +
               ". Finished: " + end_time + " (duration: " + my_duration + ").")

    the_sheets["log"].appendData([[the_log]])

    print(" ")

    print(the_log)

    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        the_sheets["resources"].url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)
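truncate_str, used above to shorten scope and bioghist notes to trunc_len characters, is not defined in this listing. A plausible one-liner, offered as an assumption rather than the project's actual helper:

def truncate_str(text, length=400):
    # Hypothetical: cut text to `length` characters, adding an ellipsis if shortened.
    return text if len(text) <= length else text[:length].rstrip() + '...'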
Example 8
def log_it(msg):
    print(msg)
    digester.post_digest(SCRIPT_NAME, msg)
Example 9
def main():

    asf.setServer('Prod')  # AS instance: Prod | Dev | Test

    mode = 'Prod'  # Prod or Test

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    ########################
    ### PROCESS OAI DATA ###
    ########################

    # Set path to Saxon processor
    # saxon_path = os.path.join(my_path, "../../resources/saxon-9.8.0.12-he.jar")

    # XSLT file to generate report
    marc_xslt_file = os.path.join(my_path, '../xslt/marcDataExtract.xsl')

    if mode == 'Prod':
        # OAI XML file to use as source
        # source_dir='/cul/cul0/lito/libsys/voyager/prod/data/loads/AS_harvest'
        source_dir = '/cul/cul0/ldpd/archivesspace/oai'
        sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
        oai_file = source_dir + '/' + yest_str + '.asAllRaw.xml'

    else:  # TEST
        yest_str = "20190915"
        # OAI XML file to use as source
        source_dir = '/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/cached_eads_20190912'  # local test
        sheet_id = '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ'
        oai_file = yest_str + '.asAllRaw.xml'
    the_sheets = {
        'oai': dataSheet(sheet_id, 'oai!A:Z'),
        'oai_last': dataSheet(sheet_id, 'oai_last!A:Z'),
        'log': dataSheet(sheet_id, 'log!A:Z')
    }

    the_outpath = os.path.join(my_path,
                               'output/' + yest_str + '.marc_reporter_out.xml')

    print(' ')

    # Copy oai current data to oai_last sheet for diff
    the_old_data = the_sheets['oai'].getData()
    the_sheets['oai_last'].clear()
    the_sheets['oai_last'].appendData(the_old_data)
    # Process OAI MARC and output to CSV
    util.saxon_process(oai_file, marc_xslt_file, the_outpath)

    # clear data from "new" sheet
    the_sheets['oai'].clear()

    # Send result csv to Google Sheet.
    y = the_sheets['oai'].importCSV(the_outpath, delim='|')

    print(' ')

    ########################
    ### PROCESS UNPUBLISHED ###
    ########################

    print('Finding unpublished records...')

    the_repos = [2, 3, 4, 5]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_heads = [
        'REPO', 'REPO_ID', 'RESOURCE_ID', 'TITLE', 'BIBID', 'CREATE_TIME',
        'SYSTEM_MTIME', 'LAST_MODIFIED_BY'
    ]

    unpubs_sheet = dataSheet(sheet_id, 'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # print(row)
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid)
            row.insert(0, repo_id)
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    # print('Total unpublished: ' + str(len(the_unpublished)))
    msg = 'Total unpublished: ' + str(len(the_unpublished))
    print(msg)
    digester.post_digest(script_name, msg)  # Test

    unpubs_sheet.clear()
    unpubs_sheet.appendData([the_heads])
    unpubs_sheet.appendData(the_unpublished)

    ########################
    ### GET NEWLY CREATED ###
    ########################

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:

        print('processing ' + d['filter'])

        the_delta_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]

        the_heads = [
            'repo', 'asid', 'title', 'identifier', 'create_time',
            'system_mtime', 'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:

            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='ctime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                # print(row)
                # get the repo from the uri string.
                repo = str(row[0].split('/')[-3]).rstrip()
                # get the asid from the uri string.
                asid = str(row[0].split('/')[-1]).rstrip()
                row.pop(0)
                row.insert(0, asid)
                row.insert(0, repo)

                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))

        digester.post_digest(script_name, 'New ' + d['filter'] + ': ' +
                             str(len(the_modifieds)))  # Test
        # the_sheet.clear()

        # the_sheet.appendData([the_fields])
        the_delta_sheet.appendData(the_modifieds)

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = 'Data imported by ' + my_name + '. Start: ' + start_time + \
        '. Finished: ' + end_time + ' (duration: ' + my_duration + ').'

    the_sheets['log'].appendData([[the_log]])

    print(' ')

    print(the_log)

    digester.post_digest(script_name, the_log)  # Test

    print(' ')

    print('Script done. Updated data is available at ' + the_sheets['oai'].url)
Example 10
        "name": "persons",
        "endpoint": "/agents/people",
    },
]

for i in the_info:
    print("Getting agents: " + i["name"])
    out_path = os.path.join(my_path, "output/agents_" + i["name"] + ".pickle")

    # Get a list of agent ids from API
    agents_list = json.loads(asf.getResponse(i["endpoint"] + "?all_ids=true"))

    agent_cnt_str = "Number of agents (" + \
        i['name'] + "): " + str(len(agents_list))
    print(agent_cnt_str)
    digester.post_digest(script_name, agent_cnt_str)

    cnt = 0

    agent_data = []

    # Loop through agent ids and get full record from API.
    for agent in agents_list:
        cnt += 1
        # print("COUNT: " + str(cnt))
        # print("Agent # " + str(agent))
        x = asf.getResponse(i["endpoint"] + "/" + str(agent))
        agent_data.append(json.loads(x))

    # Save data as pickle
    util.pickle_it(agent_data, out_path)
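util.pickle_it is not shown; presumably it just serializes the collected agent records to disk. A minimal sketch under that assumption:

import pickle

def pickle_it(obj, path):
    # Hypothetical: write any Python object to path with pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)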
Example 11
def main():

    # Set to True to harvest complete set; otherwise will select based on date.
    HARVESTALL = False

    my_name = __file__
    my_path = os.path.dirname(__file__)
    script_name = os.path.basename(my_name)

    # calculate dates in format yyyymmdd
    today = datetime.date.today().strftime("%Y%m%d")
    yesterday = (datetime.date.today() -
                 datetime.timedelta(days=1)).strftime("%Y%m%d")

    destination_folder = "/cul/cul0/ldpd/archivesspace/oai"
    # destination_folder = "/cul/cul0/ldpd/archivesspace/test"  # test
    # destination_folder = "./"  # test
    xslt_path = os.path.join(my_path, "../xslt/cleanOAI.xsl")

    out_path_raw = os.path.join(destination_folder, today + ".asRaw.xml")
    out_path_raw_all = os.path.join(destination_folder,
                                    today + ".asAllRaw.xml")
    out_path_clean = os.path.join(destination_folder, today + ".asClean.xml")

    # Set server to Prod | Test | Dev
    server = "Prod"

    fromDate = yesterday

    # # Not using date, get all records and then filter with the XSLT!
    # date_params = ""

    # Select date interval for harvest
    # TODO: change this to be controlled by param file.

    if HARVESTALL:
        date_params = " "  # Use this to harvest all records.
    else:
        date_params = "-f " + yesterday

    # Harvest OAI-PMH data
    print("Harvesting data from OAI...")
    util.oai_harvest(out_path_raw, server=server, date_params=date_params)

    # Process through XSLT

    # TODO: change xsl to not require this param, if we are doing it in the harvest!
    time_offset = 'P800DT30H'

    saxon_params = " time_offset=" + time_offset

    print("Processing file with XSLT...")
    x = util.saxon_process(out_path_raw,
                           xslt_path,
                           out_path_clean,
                           theParams=saxon_params)
    print(x)
    digester.post_digest(script_name, x)

    print("Harvesting all records for reporting ...")
    date_params = " "
    util.oai_harvest(out_path_raw_all, server=server, date_params=date_params)

    # Remove old OAI files
    util.file_cleanup(destination_folder, 30)

    digester.post_digest(
        script_name, script_name + ' completed at ' +
        str(datetime.datetime.now().strftime('%m/%d/%Y %H:%M:%S')) + '.')
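util.file_cleanup(destination_folder, 30) appears to prune harvest files older than a given number of days. The helper is not included here; a sketch of that behavior, as an assumption:

import os
import time

def file_cleanup(folder, days):
    # Hypothetical: delete regular files in folder older than `days` days.
    cutoff = time.time() - days * 86400
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
            os.remove(path)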
Example 12
        acc_info.pop("extents")

        # Clean up titles
        acc_info['title'] = str(acc_info['title']).strip()

        # Uncomment to list records in log.
        # print("processing: " + str(acc_info["uri"]).strip() + ' / ' + str(acc_info["title"]).strip() )

        all_rows.append(acc_info)

    processed_msg = 'Processed ' + \
        str(len(all_rows)) + ' records in ' + repo_name + '.'
    print(processed_msg)

    digester.post_digest(script_name, processed_msg)

    # the_heads = list(all_rows[0].keys())

    # explicitly order the columns, as dict order is unpredictable.
    the_heads = [
        'title', 'uri', 'accession_date', 'id_0', 'id_1', 'id_2', 'id_3',
        'integer_1', 'resource_bibid', 'resource_asid', 'repo', 'year',
        'fiscal-year', 'ext-number', 'ext-portion', 'ext-type',
        'processing_priority', 'processing_status', 'create_time',
        'system_mtime', 'last_modified_by'
    ]

    the_output = []

    # Build row in order specified by the_heads
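A minimal sketch of building each row in the order given by the_heads, assuming every record in all_rows is a dict keyed by those column names (hypothetical, not the original continuation):

for acc_info in all_rows:
    the_output.append([str(acc_info.get(h, '')) for h in the_heads])
the_output.insert(0, the_heads)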
Example 13
def main():

    report_level = "low"
    # 'low' = only parse/schema errors; 'high' = include schematron warnings

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    print("Script " + my_name + " begun at " + start_time + ". ")
    print(" ")

    ################################
    #
    # Rsync files from web application to storage directory
    #
    ################################

    print("====== Syncing files from production cache... ======")
    print(" ")

    keyPath = "/home/ldpdapp/.ssh/id_dsa"
    fromPath = (
        "ldpdapp@ldpd-nginx-prod1:/opt/passenger/ldpd/findingaids_prod/caches/ead_cache"
    )
    toPath = "/cul/cul0/ldpd/archivesspace/"

    myOptions = "--exclude 'clio*'"

    x = util.rsync_process(keyPath, fromPath, toPath, myOptions)
    print(x)

    print(" ")

    ################################
    #
    # Perform validation reporting
    #
    ################################

    print("====== Validating files... ======")
    print(" ")

    if report_level == "high":
        print('* Logging level: "' + report_level +
              '" — showing all errors and warnings. *')
    else:
        print(
            '* Logging level: "' + report_level +
            '" – showing only errors. Check report for complete results including warnings. *'
        )

    print(" ")

    # The Google Sheet to send data to
    the_data_sheet = dataSheet("1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0",
                               "validation!A:Z")

    # the_data_sheet = dataSheet(
    #     '1tQY9kR5YOh1e7i4dVRsl_GMxpNnUgCkb5X8qJQBAsG0', 'test!A:Z')  # Test

    # This is a dupe for other reporting
    the_data_sheet2 = dataSheet("198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY",
                                "validation!A:Z")

    # Set path to saxon processor for evaluator xslt
    saxon_path = os.path.join(my_path, '../../resources/saxon-9.8.0.12-he.jar')

    # Set path to schema validator (Jing)
    jing_path = os.path.join(my_path,
                             "../../resources/jing-20091111/bin/jing.jar")

    schema_filename = "schemas/cul_as_ead.rng"
    # schematron_filename = "schemas/cul_as_ead.sch"
    xslt_filename = "schemas/cul_as_ead.xsl"
    schema_path = os.path.join(my_path, schema_filename)
    xslt_path = os.path.join(my_path, xslt_filename)

    data_folder = "/cul/cul0/ldpd/archivesspace/ead_cache"
    # data_folder = '/cul/cul0/ldpd/archivesspace/test/ead'  # for testing

    # Use in notification email to distinguish errors/warnings
    icons = {
        "redx": "\U0000274C",  # use for parse errors
        "exclamation": "\U00002757",
        "warning": "\U000026A0\U0000FE0F",  # use for schema validation errors
        "qmark": "\U00002753",
    }

    # Load files from directory into a list
    the_file_paths = []
    for root, dirs, files in os.walk(os.path.abspath(data_folder)):
        for file in files:
            the_file_paths.append(os.path.join(root, file))

    # The column heads for the report spreadsheet
    the_heads = [
        "bibid",
        "file",
        "well-formed?",
        "valid?",
        "schema output",
        "schematron output",
        "warning type",
    ]

    the_results = []

    the_results.append(the_heads)

    # counters
    parse_errors = 0
    validation_errors = 0
    sch_warnings = 0

    for a_file in the_file_paths:
        the_file_data = []
        file_name = a_file.split("/")[-1]
        bibid = file_name.split("_")[-1].split(".")[0]

        validation_result = util.jing_process(jing_path, a_file, schema_path)

        if "fatal:" in validation_result:
            # It's a parsing error.
            err_msg = icons["redx"] + " FATAL ERROR: " + \
                file_name + " could not be parsed!"
            print(err_msg)
            digester.post_digest(script_name, err_msg)
            wf_status = False
            validation_status = False
            parse_errors += 1
        else:
            wf_status = True
            if "error:" in validation_result:
                # It's a validation error.
                validation_status = False
                err_msg = icons["warning"] + " ERROR: " + \
                    file_name + " contains validation errors."
                print(err_msg)
                digester.post_digest(script_name, err_msg)
                validation_errors += 1
            else:
                validation_status = True

        if validation_result:
            validation_result_clean = clean_output(validation_result,
                                                   incl_types=False)[0]
        else:
            validation_result_clean = validation_result

        if not wf_status:
            schematron_result_clean = "-"
            warning_types = []

        else:

            # schematron_result = util.jing_process(
            #     jing_path, a_file, schematron_path)
            schematron_result = util.saxon_process(saxon_path, a_file,
                                                   xslt_path, None)

            if schematron_result:
                # It's a Schematron violation.
                if report_level == "high":
                    # Only show if required by reporting level var (use to filter out large numbers of warnings).
                    err_msg = "WARNING: " + file_name + " has Schematron rule violations."
                    print(err_msg)
                    digester.post_digest(script_name, err_msg)
                sch_warnings += 1

            if schematron_result:
                x = clean_output(schematron_result, incl_types=True)
                schematron_result_clean = x[0]
                warning_types = x[1]
            else:
                schematron_result_clean = ""
                warning_types = ""

        the_file_data = [
            bibid,
            file_name,
            wf_status,
            validation_status,
            validation_result_clean,
            schematron_result_clean,
            ", ".join(warning_types),
        ]

        the_results.append(the_file_data)

    the_data_sheet.clear()
    the_data_sheet.appendData(the_results)
    the_data_sheet2.clear()
    the_data_sheet2.appendData(the_results)

    # generate log and add to log tab, if exists.
    the_tabs = the_data_sheet.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("EADs from " + data_folder + " evaluated by " +
               schema_filename + " and " + xslt_filename + ". Parse errors: " +
               str(parse_errors) + ". Schema errors: " +
               str(validation_errors) + ". Schematron warnings: " +
               str(sch_warnings) + ". Start: " + start_time + ". Finished: " +
               end_time + " (duration: " + my_duration + ").")

    if "log" in the_tabs:
        log_range = "log!A:A"
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(the_data_sheet.id, log_range).appendData([[the_log]])
    else:
        print("*** Warning: There is no log tab in this sheet. ***")

    print(" ")

    # print(the_log)

    print("Parse errors: " + str(parse_errors))
    digester.post_digest(script_name, "Parse errors: " + str(parse_errors))
    print("Schema errors: " + str(validation_errors))
    digester.post_digest(script_name,
                         "Schema errors: " + str(validation_errors))
    print("Schematron warnings: " + str(sch_warnings))
    digester.post_digest(script_name,
                         "Schematron warnings: " + str(sch_warnings))

    print(" ")

    exit_msg = "Script done. Check report sheet for more details: " + the_data_sheet.url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)

    quit()
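util.jing_process runs the Jing RELAX NG validator against each EAD file. The util module is not reproduced here; a sketch assuming the standard Jing jar invocation (schema first, then the document), where an empty result means the file is valid:

import subprocess

def jing_process(jing_path, file_path, schema_path):
    # Hypothetical: validate file_path against schema_path with the Jing jar
    # and return whatever Jing prints (empty when the document is valid).
    result = subprocess.run(['java', '-jar', jing_path, schema_path, file_path],
                            capture_output=True, text=True)
    return result.stdout + result.stderr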