Example #1
def main():

    asf.setServer('Prod')

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  #set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y-%m-%d"))

    sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:

        print('processing ' + d['filter'])

        the_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:

            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='mtime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                print(row)
                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))
        # the_sheet.clear()

        # the_sheet.appendData([the_fields])
        the_sheet.appendData(the_modifieds)

    quit()
Example #2
def main():
    # Main code goes here.

    asf.setServer("Prod")

    output_folder = "output/resource_remove_links"
    the_lookup_csv = "id_lookup_prod.csv"
    bibid_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-161-remove-links/acfa-161-remove-links.txt"

    # Read a list of bibids (csv)
    the_bibids = []
    with open(bibid_file) as ids:
        for row in csv.reader(ids):
            the_bibids.append(row[0])

    for b in the_bibids:

        try:
            repo, asid = asf.lookupByBibID(b, the_lookup_csv)
            print("Processing " + str(b) + "...")

            out_path_old = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_old.json")
            out_path_new = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_new.json")

            x = asf.getResource(repo, asid)

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(x)

            x_dict = json.loads(x)
            if "ead_location" in x_dict:
                print(x_dict["ead_location"])
                del x_dict["ead_location"]
            else:
                print("No URL to delete!")

            y = json.dumps(x_dict)
            # print(y)

            post = asf.postResource(repo, asid, y)
            print(post)

            # Save copy of new object
            print("Saving data to " + out_path_new + "....")

            with open(out_path_new, "w+") as f:
                f.write(y)

        except:
            print("Error: Could not process " + str(b))
            print(sys.exc_info())
            # raise

    quit()
Example #3
def main():
    # Test functions here.

    from pprint import pprint

    server = 'Test'
    asf.setServer(server)

    # The resource to scan
    the_resource = (4, 6288)

    # A place to put output of saved json objects (optional)
    output_folder = 'output/replace_extrefs'

    # Retrieve all archival objects under a given resource
    x = asf.getResponse('/repositories/' + str(the_resource[0]) +
                        '/resources/' + str(the_resource[1]) +
                        '/ordered_records')
    y = json.loads(x)['uris']

    # Select only the ones that are items or files, and add to a list
    the_refs = [r['ref'] for r in y if r['level'] in ['item', 'file']]

    cnt = 0

    for a_ref in the_refs:
        ref_decomposed = a_ref.split('/')
        repo, asid = ref_decomposed[2], ref_decomposed[4]

        ref_json = asf.getArchivalObject(repo, asid)

        out_path = output_folder + '/' + str(repo) + '_' + str(asid) + '.json'

        data_old = ref_json

        # The regex substitution
        repl = re.subn(r'<extref\s+type=\\"simple\\"\s+href=',
                       r'<extref xlink:type=\"simple\" xlink:href=',
                       ref_json,
                       flags=re.DOTALL)

        if repl[1] > 0:  # [1] is the count of replacements from subn
            # there is a change
            # Save copy of existing object
            print('Saving data to ' + out_path + '....')

            with open(out_path, "w+") as f:
                f.write(data_old)

            data_new = repl[0]
            cnt += 1
            print('Posting ' + str(repo) + '_' + str(asid) + ' to ' + server)
            z = asf.postArchivalObject(repo, asid, data_new)
            print(z)
            print(' ')

    print('Total replacements: ' + str(cnt))
Example #4
def main():
    # SERVER = "Test"  # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    LOOKUP = '/Users/dwh2128/Documents/git/dcps-utils/archivesspace/as_reports/id_lookup_prod.csv'

    sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI'
    read_sheet = dataSheet(sheet_id, 'TEST!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'Output!A:Z')

    the_data = read_sheet.getData()
    the_data.pop(0)

    # print(the_refs)

    the_output = []
    for r in the_data:
        bibid = r[0]
        repo = r[1]
        ref = r[2]
        extref_old = r[3]
        extref_new = r[5]
        the_res = json.loads(asf.getResourceByBibID(bibid, LOOKUP))
        # pprint(the_res)

        asid = the_res['uri'].split('/')[4]

        print("repo: " + str(repo) + "; asid: " + str(asid))

        the_notes = json.dumps(the_res['notes'])
        # print(the_notes)
        print(" ")

        the_new_notes = replace_notes(
            the_notes, [
                # fix problem of leading space in href
                {'find': 'xlink:href=\\" http',
                 'replace': 'xlink:href=\\"http'},
                # replace old url with new one
                {'find': extref_old,
                 'replace': extref_new}])

        # print(the_new_notes)

        the_res['notes'] = json.loads(the_new_notes)

        x = asf.postResource(repo, asid, json.dumps(the_res))
        out_row = [SERVER, repo, asid, ref, extref_old, extref_new, str(x)]
        print(out_row)
        the_output.append(out_row)

    # # write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()
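
The replace_notes helper called above is not shown in this example. A minimal sketch, assuming it simply applies each find/replace pair to the JSON-serialized notes string:

def replace_notes(notes_str, replacements):
    # Apply each find/replace pair to the serialized notes string.
    for rep in replacements:
        notes_str = notes_str.replace(rep['find'], rep['replace'])
    return notes_str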
Example #5
def main():

    asf.setServer('Prod')

    # the_repos = [2, 3, 4, 5]
    the_repos = [2]
    the_fields = ['id', 'title', 'identifier', 'create_time', 'system_mtime',
                  'last_modified_by', 'json']

    the_sheet = dataSheet('198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY',
                          'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # Get the repo and asid from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid), row.insert(0, repo_id)
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0,repo)
            the_unpublished.append(row)
            print(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    print('Total unpublished: ' + str(len(the_unpublished)))

    # the_sheet.clear()
    # the_sheet.appendData([the_fields])
    # the_sheet.appendData(the_unpublished)


    quit()
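
The get_repo helper used above (mapping a numeric repository id to a short repository code) is defined elsewhere. A minimal sketch, assuming a plain lookup table; the codes here are placeholders, not the actual mapping:

def get_repo(repo_id):
    # Hypothetical repository-id-to-code mapping; substitute the real codes.
    repo_codes = {2: 'repo-2', 3: 'repo-3', 4: 'repo-4', 5: 'repo-5'}
    return repo_codes.get(repo_id, 'unknown')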
Example #6
def main():
    # SERVER = "Test"  # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI'
    read_sheet = dataSheet(sheet_id, 'TEST!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'Output!A:Z')

    the_data = read_sheet.getData()
    the_data.pop(0)

    # print(the_refs)

    the_output = []
    for r in the_data:
        repo = r[1]
        ref = r[2]
        extref_old = r[3]
        extref_new = r[5]
        the_ao = json.loads(asf.getArchivalObjectByRef(repo, ref))
        asid = the_ao['uri'].split('/')[4]

        print("asid: " + str(asid))

        the_notes = json.dumps(the_ao['notes'])

        # fix problem of leading space in href
        the_new_notes = the_notes.replace('xlink:href=\\" http',
                                          'xlink:href=\\"http')
        # replace old url with new one
        the_new_notes = the_new_notes.replace(extref_old, extref_new)

        print(the_new_notes)
        the_ao['notes'] = json.loads(the_new_notes)

        pprint(the_ao)

        x = asf.postArchivalObject(repo, asid, json.dumps(the_ao))
        out_row = [SERVER, repo, asid, ref, extref_old, extref_new, str(x)]
        print(out_row)
        the_output.append(out_row)

    # write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()
Example #7
def main():

    # set to Prod | Dev | Test
    asf.setServer('Prod')

    bibid_file = "ead_bibids_20190520.txt"
    lookup_file = "id_lookup_prod_20190522.csv"
    outfile_loc = "ead_as_qc_reports/ead_as_qc_xml_PROD1"

    with open(bibid_file) as f:
        the_bibids = [line.rstrip('\n') for line in f]

    the_errors = []
    the_processed = []

    for a_bibid in the_bibids:
        print('Processing bibid: ' + a_bibid)
        if a_bibid:
            try:
                the_lookup = asf.lookupByBibID(a_bibid, lookup_file)
                the_repo = the_lookup[0]
                the_asid = the_lookup[1]
                the_processed.append(a_bibid)
            except:
                # Can't find in lookup
                the_repo = 0
                the_asid = 0
                the_errors.append(a_bibid)

        if (a_bibid and the_asid != 0):
            the_ead = asf.getEAD(the_repo, the_asid)

            the_filepath = outfile_loc + '/' + a_bibid + '_ead.xml'

            with open(the_filepath, "w") as myfile:
                myfile.write(the_ead)

    # Report results
    print('Processed ' + str(len(the_processed)) + ' records.')
    if len(the_errors) > 0:
        print('*** Warning: ' + str(len(the_errors)) +
              ' errors. Could not process id ' + ', '.join(the_errors) +
              ' ***')
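
Several of these scripts resolve a BIBID to a (repo, asid) pair with asf.lookupByBibID and a lookup CSV. A minimal sketch of that kind of lookup, assuming rows of the form bibid,repo,asid (the real column order may differ):

import csv

def lookup_by_bibid(bibid, lookup_file):
    # Return (repo, asid) for the first row matching bibid; raise if absent.
    with open(lookup_file) as f:
        for row in csv.reader(f):
            if row and row[0] == str(bibid):
                return int(row[1]), int(row[2])
    raise LookupError('BIBID ' + str(bibid) + ' not found in ' + lookup_file)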
Example #8
def main():

    server = 'Prod'
    asf.setServer(server)

    enum_num = 14  # extent_extent_type enumeration
    extent_data = asf.getEnumeration(enum_num)

    extent_usage_csv = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-111-extents-cleanup/extent-values-prod3.tsv'

    output_folder = 'output/enumerations'

    # Paths for reporting before/after data
    out_path_old = output_folder + '/' + str(enum_num) + 'PROD_old.json'
    out_path_new = output_folder + '/' + str(enum_num) + 'PROD_new.json'

    # Save copy of existing object
    print('Saving data to ' + out_path_old + '....')
    with open(out_path_old, "w+") as f:
        f.write(extent_data)

    # Load list from csv
    csv.register_dialect('my_dialect', delimiter='\t', quoting=csv.QUOTE_NONE)
    data = []
    with open(extent_usage_csv) as the_csv_data:
        for row in csv.reader(the_csv_data, 'my_dialect'):
            data.append(row)

    # A list of ids of extent values to remove
    unused_extents = [x[0] for x in data if x[2] == 'Not used.']

    for e in unused_extents:
        print('suppressing ' + str(e))
        # mode='suppress' to suppress, mode='unsuppress' to unsuppress
        post = asf.suppressEnumerationValue(e, mode='suppress')
        print(post)

    extent_data_new = asf.getEnumeration(enum_num)

    # Save updated object
    print('Saving data to ' + out_path_new + '....')
    with open(out_path_new, "w+") as f:
        f.write(extent_data_new)
Example #9
def main():
    # Main code goes here.

    asf.setServer("Prod")

    lookup_csv = "id_lookup_prod.csv"
    id_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-226-oclc/035s_20200915.txt"

    # Read a list of bibids and oclc strings
    the_data = []
    with open(id_file) as ids:
        for row in csv.reader(ids, delimiter="|"):
            the_data.append([row[0], row[1], row[2]])

    for a_row in the_data:
        bibid = a_row[0]
        print(bibid)
        str_2 = a_row[1]
        str_3 = a_row[2]
        try:
            repo, asid = asf.lookupByBibID(bibid, lookup_csv)

            x = asf.getResource(repo, asid)
            y = json.loads(x)

            user_defnd = y["user_defined"] if "user_defined" in y else {}
            user_defnd["string_2"] = str_2
            user_defnd["string_3"] = str_3

            print(user_defnd)

            y["user_defined"] = user_defnd

            z = json.dumps(y)
            post = asf.postResource(repo, asid, z)
            print(post)

        except Exception as e:
            print(str(e) + ": Could not lookup " + str(bibid))
Example #10
def main():
    # SERVER = "Test" # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    sheet_id = '1OABHEJF1jqA1vlbW5yTENry5W7YqKlag5nJDJ9ouCzg'
    # read_sheet = dataSheet(sheet_id, 'Test!A:Z')  # Test
    read_sheet = dataSheet(sheet_id, 'Prod!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'output!A:Z')

    the_refs = read_sheet.getDataColumns()[0]
    # print(the_refs)

    the_output = []
    for r in the_refs:
        the_ao = json.loads(asf.getArchivalObjectByRef(2, r))
        asid = the_ao['uri'].split('/')[4]
        old_date = str(the_ao['dates'][0]['begin'])
        new_ao = fix_begin_date(2, the_ao)
        new_date = str(new_ao['dates'][0]['begin'])
        print("asid: " + str(asid))
        x = asf.postArchivalObject(2, asid, json.dumps(new_ao))
        out_row = [SERVER, r, asid, old_date, new_date, str(x)]
        # print(out_row)
        the_output.append(out_row)

    write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()

    x = fix_begin_date(2, 'b2ec9ce511e4212ebb145fb909ca85bd')
    print(x)

    pprint(
        json.loads(
            asf.getArchivalObjectByRef(2, 'b2ec9ce511e4212ebb145fb909ca85bd')))
    quit()
Example #11
def main():

    # set to True to use test sheet and test json folder location.
    debug = False

    asf.setServer("Prod")

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ""  # set later
    today_str = str((date.today()).strftime("%Y%m%d"))

    if debug:
        print("[Running script in debug mode...]")
        parent_folder = "/cul/cul0/ldpd/archivesspace/test/resources"  # test folder
        sheet_id = "1wFyLN_Ea7ExCZSMuksB8MTrS9DjsUkwsmaPBujL7x0U"  # test sheet
        the_repos = [4]  # to test
    else:
        parent_folder = "/cul/cul0/ldpd/archivesspace/resources"
        sheet_id = "1T3EpIZmnh3Gk-VAIGtvavTQUIpS7AluyKQ8-sJsS8vg"
        the_repos = [2, 3, 4, 5, 6]

    output_folder = parent_folder + "/" + today_str

    the_sheets = {
        "resources": dataSheet(sheet_id, "Resources!A:Z"),
        "cm": dataSheet(sheet_id, "Collection Management!A:Z"),
        "log": dataSheet(sheet_id, "log!A:Z"),
    }

    # Set number of chars to truncate the scope and bioghist notes.
    trunc_len = 400

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["bibid", "/id_0"],
        ["title", "/title"],
        ["published", "/publish"],
        ["create_time", "/create_time"],
        ["system_mtime", "/system_mtime"],
        ["created_by", "/created_by"],
        ["last_modified_by", "/last_modified_by"],
        ["ead_location", "/ead_location"],
        ["ext_number", "/extents/0/number"],
        ["ext_portion", "/extents/0/portion"],
        ["ext_type", "/extents/0/extent_type"],
        # ["integer_1", "/user_defined/integer_1"],
        # ["integer_2", "/user_defined/integer_2"],
        # ["integer_3", "/user_defined/integer_3"],
        ["local call no.", "/user_defined/string_1"],
        ["other ctrl no. 1", "/user_defined/string_2"],
        ["other ctrl no. 2", "/user_defined/string_3"],
        ["other ctrl no. 3", "/user_defined/string_4"],
        # ["enum_1", "/user_defined/enum_1"],
        # ["enum_2", "/user_defined/enum_2"],
        ["description status", "/user_defined/enum_3"],
        ["collecting area", "/user_defined/enum_4"],
        ["level", "level"]
        # (Scope and bioghist notes are added in separately below.)
    ]
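
    # Example of the dpath extraction used further below: given
    # {"extents": [{"number": "5"}]}, dpath.util.get(data, "/extents/0/number")
    # returns "5"; a missing path raises a KeyError, hence the try/except in
    # the row-building loop.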

    # Get the collection management records for use in report.

    the_cms = []

    fields = [
        "id",
        "parent_id",
        "title",
        "system_mtime",
        "processing_priority",
        "processing_status",
    ]

    print(" ")
    print("*** Retrieve Collection Management Data ***")
    print(" ")

    for r in the_repos:
        print("Getting collection management records for repo: " + str(r) +
              "...")
        cm = asf.getCollectionManagements(r, filter="resource", fields=fields)
        for c in cm:
            row = [c[f] for f in fields]
            the_cms.append(row)

    # A data set of collection management records to post to the sheet below.
    the_cms.insert(0, fields)

    print(" ")
    print("*** Retrieve Resource Data ***")
    print(" ")

    # Get the list of resources for each repo and add to the_ids
    the_ids = []
    for r in the_repos:
        print("Getting ids for repo: " + str(r) + "...")
        asids = json.loads(
            asf.getResponse("/repositories/" + str(r) +
                            "/resources?all_ids=true"))

        print(str(len(asids)) + " records found in repo " + str(r) + ".")
        for i in asids:
            the_ids.append([r, i])

    # Construct the head row
    the_heads = [x[0] for x in the_fields]
    the_heads.insert(0, "asid")
    the_heads.insert(0, "repo")
    the_heads.append("scope note")
    the_heads.append("scopenote length")

    the_heads.append("bioghist note")
    the_heads.append("biognote length")

    the_output = [the_heads]

    # Fetch the resources from the ids
    print("Downloading resources...")

    if not os.path.exists(output_folder):
        print("Creating directory " + output_folder + "...")
        os.makedirs(output_folder)

    for repo, asid in the_ids:
        # print("Processing " + str(repo) + ":" + str(asid) + "...")
        the_row = [repo, asid]
        res_json = asf.getResource(repo, asid)
        res_dict = json.loads(res_json)

        out_path = output_folder + "/" + str(repo) + "_" + str(asid) + ".json"

        # Write the JSON to file.
        with open(out_path, "w+") as f:
            f.write(res_json)

        # Use dpath to extract values from dict and compose into rows.
        for af in the_fields:
            try:
                d = str(dpath.util.get(res_dict, af[1]))
            except:
                d = ""
            the_row.append(d)

        # Process scope and bioghist notes

        the_notes = dpath.util.values(res_dict, "notes/*", afilter=None)

        the_scope_notes = []
        the_biog_notes = []

        for a_note in the_notes:
            try:
                if a_note["type"] == "scopecontent":
                    the_scope_notes.append(a_note)
            except:
                pass
            try:
                if a_note["type"] == "bioghist":
                    the_biog_notes.append(a_note)
            except:
                pass

        if the_scope_notes:
            # If there are scope notes, grab all the text and concatenate. Then get the total length in # chars.
            scope_note_texts = [
                s["subnotes"][0]["content"] for s in the_scope_notes
            ]
            the_scope_text = " ".join(scope_note_texts)
            scope_note_len = len(the_scope_text)

            scope_note_short = truncate_str(the_scope_text, length=trunc_len)
        else:
            scope_note_short = ""
            scope_note_len = 0

        if the_biog_notes:
            # If there are bioghist notes, grab all the text and concatenate. Then get the total length in # chars.
            biog_note_texts = [
                s["subnotes"][0]["content"] for s in the_biog_notes
            ]
            the_biog_text = " ".join(biog_note_texts)
            biog_note_len = len(the_biog_text)

            biog_note_short = truncate_str(the_biog_text, length=trunc_len)
        else:
            biog_note_short = ""
            biog_note_len = 0

        the_row.append(scope_note_short)
        the_row.append(str(scope_note_len))
        the_row.append(biog_note_short)
        the_row.append(str(biog_note_len))

        the_output.append(the_row)

    # Zip up the JSON files for storage.
    zip_out = make_archive(today_str,
                           "zip",
                           root_dir=parent_folder,
                           base_dir=today_str)

    print(zip_out)

    # Zip is saved in working dir; move to correct location.
    print("Saving zip file " + str(today_str) + ".zip to " + parent_folder)

    # Test if file already exists.
    if os.path.exists(parent_folder + "/" + str(today_str) + ".zip"):
        print("File " + parent_folder + "/" + str(today_str) +
              ".zip exists already. Replacing with new zip file...")

        os.remove(parent_folder + "/" + str(today_str) + ".zip")

    move(zip_out, parent_folder)

    # Remove the json folder once zip is in place.
    rmtree(parent_folder + "/" + today_str)

    util.file_cleanup(parent_folder, 60)

    # Write output to Google sheet.

    print(" ")
    print("*** Writing Data to Report ***")
    print(" ")

    the_sheets["cm"].clear()
    the_sheets["cm"].appendData(the_cms)
    digester.post_digest(
        script_name,
        "Total collection management records: " + str(len(the_cms) - 1))

    the_sheets["resources"].clear()
    the_sheets["resources"].appendData(the_output)
    digester.post_digest(
        script_name,
        "Total number of resource records: " + str(len(the_output) - 1))

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + my_name + ". Start: " + start_time +
               ". Finished: " + end_time + " (duration: " + my_duration + ").")

    the_sheets["log"].appendData([[the_log]])

    print(" ")

    print(the_log)

    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        the_sheets["resources"].url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)
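
The truncate_str helper called above is not shown; a minimal sketch, assuming it simply cuts a note down to the given number of characters:

def truncate_str(text, length=400):
    # Return text shortened to `length` characters, with a trailing ellipsis.
    return text if len(text) <= length else text[:length] + '...'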
Example #12
# Requires pytest. Checks basic connectivity and read functions from sample data sheet.
# Run all tests with 'pytest --disable-pytest-warnings'.
# If in virtual environment, use 'python -m pytest --disable-pytest-warnings'.
import ASFunctions as asf
import json
# import logging

asf.setServer('Prod')


def test_get_resource_prod():
    x = json.loads(asf.getResource(2, 5907))
    assert x[
        'id_0'] == '4078601', "Prod: BIBID for resource 2:5907 should be 4078601"


asf.setServer('Test')


def test_get_resource_test():
    x = json.loads(asf.getResource(2, 5907))
    assert x[
        'id_0'] == '4078601', "Test: BIBID for resource 2:5907 should be 4078601"


asf.setServer('Dev')


def test_get_resource_dev():
    x = json.loads(asf.getResource(2, 5907))
    assert x[
        'id_0'] == '4078601', "Dev: BIBID for resource 2:5907 should be 4078601"
Example #13
def main():

    # Set to Test | Dev | Prod
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'notes!A:Z')

    id_file = 'replace_notes.csv'
    output_folder = 'output/notes'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    the_before_afters = []

    the_heads = [
        'repo', 'asid', 'uid', 'title', 'note_cnt1', 'note_cnt2', 'status'
    ]

    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[
            1] + '_old.json'

        # read from API
        print('getting data for ' + str(an_obj[0]) + ', ' + str(an_obj[1]))

        try:
            x = asf.getArchivalObjectByRef(an_obj[0], an_obj[1])

            # Save copy of existing object
            print('Saving data to ' + out_path + '....')

            f = open(out_path, "w+")
            f.write(x)
            f.close()

            x = json.loads(x)

            asid = str(
                x['uri'].split('/')[-1])  # get the asid from the uri string.

            title = x['title']

            repo = str(an_obj[0])

            y = x

            my_notes_init = y['notes']
            my_notes_new = []

            if len(my_notes_init) > 0:
                if 'subnotes' in my_notes_init[0]:

                    for a_note in my_notes_init:
                        if 'subnotes' in a_note:
                            if 'extref' in a_note['subnotes'][0]['content']:
                                pass
                            else:
                                my_notes_new.append(a_note)

            if len(my_notes_new) == len(my_notes_init):
                the_status = "[no change]"
            else:
                the_status = "[deleted note]"

            y['notes'] = my_notes_new
            note_cnt1 = len(my_notes_init)
            note_cnt2 = len(y['notes'])

            the_before_afters.append([
                an_obj[0], asid, an_obj[1], title, note_cnt1, note_cnt2,
                the_status
            ])

            # convert dict back to json for posting.
            z = json.dumps(y)

            # Post the fixed object back to API.
            # (Comment these out for testing.)
            resp = asf.postArchivalObject(repo, asid, z)
            print(resp)

        except:
            print('Could not retrieve record ' + str(an_obj[1]))

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')

    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")

    quit()
Example #14
# Script to add authorities or make other changes to subjects. See ACFA-287.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
import os.path


SERVER = 'Prod'
asf.setServer(SERVER)

my_name = __file__


# pprint(asf.getSubject(11453))
# quit()

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

sheet_id = '1b-dFdOaWD7AEqzhK0uuGXkonum6wX8Zcriq8-G4l33Q'

# list_sheet = dataSheet(sheet_id, 'Test!A:Z')  # test
list_sheet = dataSheet(sheet_id, 'batch!A:Z')
report_sheet = dataSheet(sheet_id, 'output!A:Z')


def add_authority(server, asid, uri, source=None):
    # function to (1) query subject and determine if it already has
    # an authority uri, (2) if not, add in the provided URI,
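
# A minimal sketch of the logic described above. The asf.postSubject call, the
# 'authority_id' field handling, and getSubject returning a JSON string are
# assumptions; the real helper may differ.
def add_authority_sketch(server, asid, uri, source=None):
    asf.setServer(server)
    subj = json.loads(asf.getSubject(asid))
    if subj.get('authority_id'):
        # (1) Subject already has an authority URI; leave it unchanged.
        return subj
    # (2) Otherwise add the provided URI (and source, if given).
    subj['authority_id'] = uri
    if source:
        subj['source'] = source
    return asf.postSubject(asid, json.dumps(subj))  # hypothetical post call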
Example #15
def main():

    # Set to Test | Dev | Prod
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'daos-prod!A:Z')

    id_file = 'replace_daos.csv'
    output_folder = 'output/daos/prod'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    the_before_afters = []

    the_heads = ['repo', 'asid', 'uid', 'title', 'before', 'after']

    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[
            1] + '_old.json'

        # read from API

        try:
            x = asf.getDigitalObjectFromParent(an_obj[0], an_obj[1])

            # Save copy of existing object
            print('Saving data to ' + out_path + '....')

            f = open(out_path, "w+")
            f.write(x)
            f.close()

            x = json.loads(x)

            the_old_field_data = x['file_versions'][0]['file_uri']

            asid = str(
                x['uri'].split('/')[-1])  # get the asid from the uri string.

            title = x['title']

            repo = str(an_obj[0])

            y = x

            y['file_versions'][0]['file_uri'] = re.sub(
                r"^(.*)-staging(.*)'$", r'\1\2',
                x['file_versions'][0]['file_uri'])

            if y['file_versions'][0]['file_uri'] == the_old_field_data:
                the_new_field_data = "[no change]"
            else:
                the_new_field_data = y['file_versions'][0]['file_uri']

            the_before_afters.append([
                an_obj[0], asid, an_obj[1], title, the_old_field_data,
                the_new_field_data
            ])

            # convert dict back to json for posting.
            z = json.dumps(y)

            # Post the fixed object back to API.
            # (Comment these out for testing.)
            if the_new_field_data != "[no change]":
                resp = asf.postDigitalObject(repo, asid, z)
                print(resp)
            else:
                print('No update: skipping record.')

        except:
            print('Could not retrieve record ' + str(an_obj[1]))

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")

    quit()
Example #16
def main():

    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'ampersands!A:Z')

    id_file = 'archival_objects.csv'
    output_folder = 'output/archival_objects'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    # Search/replace patterns
    the_search_pattern = '&amp;amp;'
    the_replace_pattern = '&amp;'

    the_before_afters = []

    # the fields to perform regex replace on.
    the_fields = ['title', 'display_string']

    the_heads = ['repo', 'asid', 'uid', 'before', 'after']

    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[
            1] + '_old.json'

        # read from API
        x = asf.getArchivalObjectByRef(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')

        f = open(out_path, "w+")
        f.write(x)
        f.close()

        x = json.loads(x)

        asid = str(
            x['uri'].split('/')[-1])  # get the asid from the uri string.
        repo = str(an_obj[0])

        the_initial_values = [
            str('{' + f + '_old:} ' + x[f]) for f in the_fields
        ]
        the_initial_values = "\n".join(the_initial_values)
        # print(the_initial_values)

        # TODO: function modifies x as well as y. Harmless but messy.
        y = regex_dict(x, the_fields, the_search_pattern, the_replace_pattern)

        the_new_values = [
            str('{' + f + '_new:} ' + y[f] + ' ') for f in the_fields
        ]
        the_new_values = "\n".join(the_new_values)

        the_before_afters.append(
            [repo, asid, an_obj[1], the_initial_values, the_new_values])

        # convert dict back to json for posting.
        z = json.dumps(y)

        # Post the fixed object back to API.
        # (Comment out these lines to test output without replacing.)
        post = asf.postArchivalObject(repo, asid, z)
        print(post)

    # Report changes to Google Sheet

    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")
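
The regex_dict helper is not shown here. A minimal sketch consistent with the TODO above (it operates on the dict it is given, so the caller's x is modified as well as returned), assuming a straightforward re.sub per field:

import re

def regex_dict(obj, fields, search_pattern, replace_pattern):
    # Apply the replacement to each named field; mutates obj and returns it.
    for field in fields:
        obj[field] = re.sub(search_pattern, replace_pattern, obj[field])
    return obj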
Example #17
# Automated reporting of ArchivesSpace accessions info.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
from operator import itemgetter
import datetime
import re
import os.path
import dateutil.parser
import digester  # for generating composite digest of report info.

# set Prod | Dev | Test
target_server = 'Prod'  # Prod | Dev | Test
asf.setServer(target_server)

DEBUG = False
# mode = 'Prod'  # Prod or Test

MY_NAME = __file__
SCRIPT_NAME = os.path.basename(MY_NAME)

# This makes sure the script can be run from any working directory and still find related files.
MY_PATH = os.path.dirname(__file__)

# File to use to lookup bibids
LOOKUP_CSV = os.path.join(MY_PATH, "id_lookup_prod.csv")


def main():
Example #18
def main():

    # Set value to switch to, publish (True) or unpublish (False)
    publish_value = False

    # Report changes to a spreadsheet?
    report_results = True

    asf.setServer('Prod')

    # A GSheet to post report to
    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'aos_unpub3!A:Z')

    # A CSV of format <repo>,<refid>
    id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-141-unpublish-archival-objects/unpublish_aos_series_IIIA_PROD_p7.csv'

    # A folder to put json objects for auditing purposes
    output_folder = 'output/unpubs3'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    the_before_afters = []

    the_heads = ['repo', 'asid', 'uid', 'title', 'before', 'after']

    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[
            1] + '_old.json'

        # read from API
        x = asf.getArchivalObjectByRef(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')

        f = open(out_path, "w+")
        f.write(x)
        f.close()

        x = json.loads(x)

        asid = str(
            x['uri'].split('/')[-1])  # get the asid from the uri string.
        repo = str(an_obj[0])

        title = x['title']

        y = x
        old_value = x['publish']
        y['publish'] = publish_value
        new_value = y['publish']

        if new_value == old_value:
            new_value = '[no change]'

        the_before_afters.append(
            [repo, asid, an_obj[1], title, old_value, new_value])

        # convert dict back to json for posting.
        z = json.dumps(y)

        if new_value != "[no change]":
            resp = asf.postArchivalObject(repo, asid, z)
            print(resp)

        else:
            print('No update: skipping record.')

    # Report changes to Google Sheet

    if report_results == True:
        print('Writing before/after info to sheet...')
        the_report_sheet.clear()
        the_report_sheet.appendData(the_before_afters)

    print("Done!")
Example #19
def main():

    asf.setServer('Prod')

    id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-147-hrw-access-restrictions/acfa-147-aos_UNVETTED.csv'
    output_folder = 'output/archival_objects_accessrestrict'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    access_types = {
        'unvetted': {
            'vocab': 'TEMPORARILY UNAVAILABLE',
            'text': '[Unvetted]'
        },
        'vetted': {
            'vocab': 'AVAILABLE',
            'text': '[Vetted, open]'
        }
    }

    # Set to 'vetted' or 'unvetted'
    the_type = 'unvetted'

    for an_obj in the_ids:
        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[
            1] + '_old.json'

        # read from API
        x = asf.getArchivalObjectByRef(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')

        with open(out_path, "w+") as f:
            f.write(x)

        y = json.loads(x)

        asid = str(
            y['uri'].split('/')[-1])  # get the asid from the uri string.
        repo = str(an_obj[0])

        print('Processing ' + str(repo) + ' - ' + str(asid) + '...')

        the_notes = y['notes']

        # Test if there is already an accessrestrict
        has_accrestrict = False
        for an_item in the_notes:
            if an_item['type'] == 'accessrestrict':
                has_accrestrict = True

        if has_accrestrict == False:

            print('Adding access restrict note ...')

            the_access_note = {
                'jsonmodel_type':
                'note_multipart',
                'publish':
                True,
                'rights_restriction': {
                    'local_access_restriction_type':
                    [access_types[the_type]['vocab']]
                },
                'subnotes': [{
                    'content': access_types[the_type]['text'],
                    'jsonmodel_type': 'note_text',
                    'publish': True
                }],
                'type':
                'accessrestrict'
            }

            y['notes'].append(the_access_note)
            # the_notes = y['notes']

            z = json.dumps(y)

            # print(z)

            post = asf.postArchivalObject(repo, asid, z)
            print(post)

        else:
            print('Already has access restrict note. Skipping!')

    print("Done!")
Example #20
def main():

    # Set to Test | Dev | Prod
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'daos-publish!A:Z')

    # Set value to switch to, publish (True) or unpublish (False)
    publish_value = True

    # id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-162/acfa-162-mitchell.csv'
    id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-162/acfa-162-kay.csv'
    output_folder = 'output/daos-publish'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    the_before_afters = []

    the_heads = ['repo', 'asid', 'uid', 'title', 'before', 'after']

    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[
            1] + '_old.json'

        # read from API

        # try:
        x = asf.getDigitalObjectFromParent(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')

        f = open(out_path, "w+")
        f.write(x)
        f.close()

        x = json.loads(x)

        # the_old_field_data = x['file_versions'][0]['file_uri']
        the_old_field_data = x['publish']

        asid = str(
            x['uri'].split('/')[-1])  # get the asid from the uri string.

        title = x['title']

        repo = str(an_obj[0])

        y = x

        # Here set the desired value
        y['publish'] = publish_value

        if y['publish'] == the_old_field_data:
            the_new_field_data = "[no change]"
        else:
            the_new_field_data = y['publish']

        the_before_afters.append([
            an_obj[0], asid, an_obj[1], title, the_old_field_data,
            the_new_field_data
        ])

        # convert dict back to json for posting.
        z = json.dumps(y)

        # Post the fixed object back to API.
        # (Comment these out for testing.)
        if the_new_field_data != "[no change]":
            resp = asf.postDigitalObject(repo, asid, z)
            print(resp)
        else:
            print('No update: skipping record.')

        # except:
        #     print('Could not retrieve record ' + str(an_obj[1]))

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")

    quit()
Example #21
my_name = __file__
script_name = os.path.basename(my_name)

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

sheet_id = "1pZk2tPMuZDOd1veOBSJNRk2fprA6p3Qb3WKZDtZay88"
the_sheet = dataSheet(sheet_id, "subjects!A:Z")
# the_sheet = dataSheet(sheet_id, "test!A:Z") # test

now1 = datetime.datetime.now()
start_time = str(now1)
end_time = ""  # set later

# First get the subject records from API (this can take a long time!)

asf.setServer("Prod")  # AS instance: Prod | Dev | Test

# out_path = os.path.join(my_path, "output/subjects.pickle")
out_path = "/cul/cul0/ldpd/archivesspace/subjects/subjects.pickle"

# uncomment to do the full download.
the_subjects = asf.getSubjects()
util.pickle_it(the_subjects, out_path)

# Report the saved data to Google Sheet

# List of fields to extract, expressed as dpaths.
the_fields = [
    ["uri", "uri"],
    ["title", "title"],
    ["source", "source"],
Example #22
def main():

    asf.setServer("Prod")

    # the_lookup_csv = "id_lookup_TEST.csv"  # test
    the_lookup_csv = "id_lookup_prod.csv"  # test

    output_folder = "output/resource_language_encode"

    the_sheet = dataSheet("1eTPY7AbDvjDU-lzK2VQruvZAvlGkAJZglh2JrruPvdg", "Test6!A:Z")

    the_data = the_sheet.getData()

    the_new_data = []
    the_new_data.append(the_data.pop(0))

    counter = 0

    for a_row in the_data:

        counter += 1
        print(" ")
        print(counter)

        the_new_row = a_row
        the_bibid = a_row[0]
        the_041 = a_row[1]
        the_string = a_row[3]

        res_info = asf.lookupByBibID(the_bibid, the_lookup_csv)

        if res_info:
            out_path_old = (
                output_folder
                + "/"
                + str(res_info[0])
                + "_"
                + str(res_info[1])
                + "_old.json"
            )
            out_path_new = (
                output_folder
                + "/"
                + str(res_info[0])
                + "_"
                + str(res_info[1])
                + "_new.json"
            )

            # pull down the resource
            the_resource = asf.getResource(res_info[0], res_info[1])

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")

            with open(out_path_old, "w+") as f:
                f.write(the_resource)

            res_dict = json.loads(the_resource)

            langmaterials = res_dict["lang_materials"]

            # Collect encoded languages already present. There should be just one but not guaranteed, so make a list.
            primary_langs = []
            for n in langmaterials:
                try:
                    if n["language_and_script"]:
                        # print("YES")
                        primary_langs.append(n["language_and_script"]["language"])
                except:
                    print("Exception!")

            print("old:")
            print(primary_langs)

            print("new:")
            langs_parsed = language_lookup(the_string)
            print(langs_parsed)

            print("to add: ")
            langs_diff = diff(langs_parsed, primary_langs)
            print(langs_diff)

            if len(langs_diff) > 0:

                for l in langs_diff:
                    res_dict["lang_materials"].append(make_language_note(l))

                new_resource = json.dumps(res_dict)
                # Save new object
                print("Saving data to " + out_path_new + "....")

                with open(out_path_new, "w+") as f:
                    f.write(new_resource)

                # Post new resource back to API

                print("Posting data for " + str(res_info[0]) + " : " + str(res_info[1]))
                try:
                    post = asf.postResource(res_info[0], res_info[1], new_resource)
                    print(post)
                except:
                    print(
                        "Error: There was a problem posting resource "
                        + str(res_info[0])
                        + ":"
                        + str(res_info[1])
                        + "!"
                    )
                    langs_diff.append("[ERROR]")

            else:
                print("No new languages to add. Skipping.")

            the_new_row.append(",".join(langs_diff))
            the_new_data.append(the_new_row)

    the_sheet.clear()
    the_sheet.appendData(the_new_data)
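
The diff and make_language_note helpers above are not shown. Minimal sketches, assuming diff returns the parsed languages not already encoded and make_language_note builds a bare lang_materials entry (the real helpers may add more detail):

def diff(first, second):
    # Return items of `first` that are not present in `second`.
    return [item for item in first if item not in second]

def make_language_note(lang_code):
    # Minimal lang_materials entry for a language code, matching the
    # language_and_script structure read earlier in this script.
    return {
        'jsonmodel_type': 'lang_material',
        'language_and_script': {
            'jsonmodel_type': 'language_and_script',
            'language': lang_code,
        },
    }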
Example #23
def main():
    # Main code goes here.

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    asf.setServer("Prod")

    the_sheet = dataSheet("1UQm7ffd1Kq4zqlzHZajd9YkwW1_nmOJFS1W7nI-c_Vk",
                          "new-batch!A:Z")
    output_folder = os.path.join(my_path, "output/resource_collecting_area")

    the_rows = the_sheet.getData()
    the_new_rows = []

    the_heads = the_rows.pop(0)

    the_new_rows.append(the_heads)

    coll_area_index = 8  # the column of collecting area

    for a_row in the_rows:
        the_new_row = a_row
        # print(a_row)
        coll = ""
        repo, asid = a_row[0], a_row[1]
        if len(a_row) > coll_area_index:
            # if there is a collecting location to add
            coll = a_row[coll_area_index]

            the_resource = asf.getResource(repo, asid)

            out_path_old = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_old.json")
            out_path_new = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_new.json")

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(the_resource)

            the_data = json.loads(the_resource)

            fix = False
            if "user_defined" in the_data:
                the_user_defined = the_data["user_defined"]
                if "enum_4" in the_user_defined:
                    print("Already has enum_4! Skipping.")
                else:
                    fix = True
                    the_user_defined["enum_4"] = coll
                    the_data["user_defined"] = the_user_defined
                    the_new_resource = json.dumps(the_data)

                    # Save copy of new object
                    print("Saving data to " + out_path_new + "....")
                    with open(out_path_new, "w+") as f:
                        f.write(the_new_resource)

                if fix == True:

                    try:
                        post = "[NONE]"
                        post = asf.postResource(repo, asid, the_new_resource)
                        print(post)
                    except:
                        print("Error: There was a problem posting resource " +
                              str(repo) + ":" + str(asid) + "!")

                    the_new_row.append(coll)
            else:
                print("ERROR: No user_defined data in " + str(repo) + ":" +
                      str(asid))

        the_new_rows.append(the_new_row)

    the_sheet.clear()
    the_sheet.appendData(the_new_rows)

    # print(the_new_rows)

    quit()
Example #24
def main():
    # Main code goes here.

    asf.setServer("Prod")

    on_site = False
    # set to True to get on-site note, False to get off-site note. See the_access_note var below.

    output_folder = "output/resource_on-site_access"

    lookup_csv = "id_lookup_prod.csv"

    # bibid_file = (
    #     "/Users/dwh2128/Documents/ACFA/TEST/ACFA-224-onsite-notes/acfa-224-list_3.csv"
    # )
    bibid_file = (
        "/Users/dwh2128/Documents/ACFA/TEST/ACFA-243-off-site/acfa-243_off-site.csv"
    )

    # Read a list of bibids (csv)
    the_bibids = []
    with open(bibid_file) as ids:
        for row in csv.reader(ids):
            the_bibids.append(row[0])

    if on_site == True:
        the_access_note = {
            "jsonmodel_type": "note_multipart",
            "label": "Restrictions on Access",
            "type": "accessrestrict",
            "rights_restriction": {"local_access_restriction_type": []},
            "subnotes": [
                {
                    "jsonmodel_type": "note_text",
                    "content": "This collection is located on-site.",
                    "publish": True,
                }
            ],
            "publish": True,
        }
    else:
        the_access_note = {
            "jsonmodel_type": "note_multipart",
            "label": "Restrictions on Access",
            "type": "accessrestrict",
            "rights_restriction": {"local_access_restriction_type": []},
            "subnotes": [
                {
                    "jsonmodel_type": "note_text",
                    "content": "This collection is located off-site. You will need to request this material at least three business days in advance to use the collection in the Rare Book and Manuscript Library reading room.",
                    "publish": True,
                }
            ],
            "publish": True,
        }

    for bib in the_bibids:

        try:
            repo, asid = asf.lookupByBibID(bib, lookup_csv)
        except:
            print("Error: No record found for " + str(bib) + ". Skipping...")
            continue

        out_path_old = output_folder + "/" + str(repo) + "_" + str(asid) + "_old.json"
        out_path_new = output_folder + "/" + str(repo) + "_" + str(asid) + "_new.json"

        the_resource = asf.getResource(repo, asid)

        # Save copy of existing object
        print("Saving data to " + out_path_old + "....")

        with open(out_path_old, "w+") as f:
            f.write(the_resource)

        the_data = json.loads(the_resource)

        # Test if there is already an access restriction note.
        has_note = False
        for a_note in the_data["notes"]:
            try:
                if a_note["type"] == "accessrestrict":
                    has_note = True
            except KeyError:
                print("Note has no type -- skipping.")

        if has_note == True:
            print(str(bib) + " - Warning: Already has access note.")
        # else:
        the_data["notes"].append(the_access_note)

        the_new_resource = json.dumps(the_data)

        # Save copy of new object
        print("Saving data to " + out_path_new + "....")

        with open(out_path_new, "w+") as f:
            f.write(the_new_resource)

        try:
            post = asf.postResource(repo, asid, the_new_resource)
            print(post)
        except:
            print(
                "Error: There was a problem posting resource "
                + str(repo)
                + ":"
                + str(asid)
                + "!"
            )

    quit()
Example #25
# Script to get barcode and holding info from spreadsheet
# and add to top containers in ArchivesSpace via API. See ACFA-206.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
import dcps_utils as util
import os.path
import csv
import datetime

asf.setServer('Prod')

my_name = __file__

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

# sheet_id = '1gUx1cPS8POLxqRblYIs1vlpr7yDGOyHmAJqpl6nMo4k'
sheet_id = '1e43qKYvqGQFOMxA70U59yPKPs18y-k3ohRNdU-qrTH0'  # test

# list_sheet = dataSheet(sheet_id, 'report!A:Z')
list_sheet = dataSheet(sheet_id, 'test!A:Z')  # test

the_data = list_sheet.getData()

the_heads = the_data.pop(0)

today = datetime.date.today().strftime("%Y-%m-%d")
Example #26
def main():

    asf.setServer('Prod')  # AS instance: Prod | Dev | Test

    mode = 'Prod'  # Prod or Test

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    ########################
    ### PROCESS OAI DATA ###
    ########################

    # Set path to Saxon processor
    # saxon_path = os.path.join(my_path, "../../resources/saxon-9.8.0.12-he.jar")

    # XSLT file to generate report
    marc_xslt_file = os.path.join(my_path, '../xslt/marcDataExtract.xsl')

    if mode == 'Prod':
        # OAI XML file to use as source
        # source_dir='/cul/cul0/lito/libsys/voyager/prod/data/loads/AS_harvest'
        source_dir = '/cul/cul0/ldpd/archivesspace/oai'
        sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
        oai_file = source_dir + '/' + yest_str + '.asAllRaw.xml'

    else:  # TEST
        yest_str = "20190915"
        # OAI XML file to use as source
        source_dir = '/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/cached_eads_20190912'  # local test
        sheet_id = '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ'
        oai_file = yest_str + '.asAllRaw.xml'
    the_sheets = {
        'oai': dataSheet(sheet_id, 'oai!A:Z'),
        'oai_last': dataSheet(sheet_id, 'oai_last!A:Z'),
        'log': dataSheet(sheet_id, 'log!A:Z')
    }

    the_outpath = os.path.join(my_path,
                               'output/' + yest_str + '.marc_reporter_out.xml')

    print(' ')

    # Copy oai current data to oai_last sheet for diff
    the_old_data = the_sheets['oai'].getData()
    the_sheets['oai_last'].clear()
    the_sheets['oai_last'].appendData(the_old_data)
    # Process OAI MARC and output to CSV
    util.saxon_process(oai_file, marc_xslt_file, the_outpath)

    # clear data from "new" sheet
    the_sheets['oai'].clear()

    # Send result csv to Google Sheet.
    y = the_sheets['oai'].importCSV(the_outpath, delim='|')

    print(' ')

    ########################
    ### PROCESS UNPUBLISHED ###
    ########################

    print('Finding unpublished records...')

    the_repos = [2, 3, 4, 5]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_heads = [
        'REPO', 'REPO_ID', 'RESOURCE_ID', 'TITLE', 'BIBID', 'CREATE_TIME',
        'SYSTEM_MTIME', 'LAST_MODIFIED_BY'
    ]

    unpubs_sheet = dataSheet(sheet_id, 'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # print(row)
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid), row.insert(0, repo_id)
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    # print('Total unpublished: ' + str(len(the_unpublished)))
    msg = 'Total unpublished: ' + str(len(the_unpublished))
    print(msg)
    digester.post_digest(script_name, msg)  # Test

    unpubs_sheet.clear()
    unpubs_sheet.appendData([the_heads])
    unpubs_sheet.appendData(the_unpublished)

    ########################
    ### GET NEWLY CREATED ###
    ########################

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:

        print('processing ' + d['filter'])

        the_delta_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]

        the_heads = [
            'repo', 'asid', 'title', 'identifier', 'create_time',
            'system_mtime', 'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:

            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='ctime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
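            # Records whose create time falls on the_date (normally yesterday).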
            for a in x:
                row = [a[v] for v in the_fields]
                # print(row)
                # get the repo and asid from the uri string.
                repo = str(row[0].split('/')[-3]).rstrip()
                asid = str(row[0].split('/')[-1]).rstrip()
                # replace the uri column with separate asid and repo columns.
                row.pop(0)
                row.insert(0, asid)
                row.insert(0, repo)

                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))

        digester.post_digest(script_name, 'New ' + d['filter'] + ': ' +
                             str(len(the_modifieds)))  # Test
        # the_delta_sheet.clear()

        # the_delta_sheet.appendData([the_heads])
        the_delta_sheet.appendData(the_modifieds)

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = 'Data imported by ' + my_name + '. Start: ' + start_time + \
        '. Finished: ' + end_time + ' (duration: ' + my_duration + ').'

    the_sheets['log'].appendData([[the_log]])

    print(' ')

    print(the_log)

    digester.post_digest(script_name, the_log)  # Test

    print(' ')

    print('Script done. Updated data is available at ' + the_sheets['oai'].url)
Exemple #27
0
def main():

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    if DEBUG is True:
        sheet_id = "18uvn9wIABHVIdjlSRNXqnHUKB2aTvZgKO62e-UFNuO8"  # test
    else:
        sheet_id = "1dTeMAK_cGWAUvrqvAiY2hGy4gJewrmWjnuIZu8NhWwE"

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    # First get the agent records from API (this can take a long time!)

    asf.setServer("Prod")  # AS instance: Prod | Dev | Test

    if DEBUG is True:
        out_folder = "/cul/cul0/ldpd/archivesspace/test/agents"
    else:
        out_folder = "/cul/cul0/ldpd/archivesspace/agents"

    family_agents_file = os.path.join(out_folder, "agents_families.pickle")
    corp_agents_file = os.path.join(out_folder, "agents_corporate.pickle")
    persons_agents_file = os.path.join(out_folder, "agents_persons.pickle")
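    # The pickle files presumably let get_agent_data cache the harvested agent
    # records between runs, since the full API pull can take a long time.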

    the_info = [
        {
            "name": "families",
            "endpoint": "/agents/families",
            "sheet": dataSheet(sheet_id, "families!A:Z"),
            "pickle": family_agents_file
        },
        {
            "name": "corporate",
            "endpoint": "/agents/corporate_entities",
            "sheet": dataSheet(sheet_id, "corporate!A:Z"),
            "pickle": corp_agents_file
        },
        {
            "name": "persons",
            "endpoint": "/agents/people",
            "sheet": dataSheet(sheet_id, "persons!A:Z"),
            "pickle": persons_agents_file
        },
    ]
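    # Each entry pairs an agent-type API endpoint with its report tab and
    # pickle cache file.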

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["uri", "uri"],
        ["title", "title"],
        ["source", "names/0/source"],
        ["authority_id", "names/0/authority_id"],
        ["is_linked_to_published_record", "is_linked_to_published_record"],
        ["publish", "publish"],
        ["last_modified_by", "last_modified_by"],
        ["last_modified", "system_mtime"],
    ]
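    # e.g., dpath.util.get(agent, "names/0/source") reads agent["names"][0]["source"];
    # a missing path raises KeyError.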

    the_record_cnts = {}

    if DEBUG is True:
        print("*** (DEBUG MODE) ***")

    for i in the_info:
        print("Getting agents: " + i["name"])
        agent_data = get_agent_data(i["name"], i["endpoint"], i["pickle"])

        print(" ")

        # Report the saved data to Google Sheet

        the_sheet = i["sheet"]

        the_heads = [x[0] for x in the_fields]
        the_output = [the_heads]

        the_record_cnts[i["name"]] = str(len(agent_data))

        for agent in agent_data:
            the_row = []
            # Use dpath to extract values from dict and compose into rows.
            for af in the_fields:
                try:
                    d = str(dpath.util.get(agent, af[1]))
                except (KeyError, ValueError):
                    # dpath raises KeyError for a missing path (ValueError for
                    # an ambiguous glob); default to an empty cell.
                    d = ""
                the_row.append(d)
            # print(the_row)
            the_output.append(the_row)

        the_sheet.clear()
        save = the_sheet.appendData(the_output)
        print(save)

    # Generate log

    print(the_record_cnts)
    print(" ".join(the_record_cnts))

    cnt_str = "".join(k + "=" + v + ". " for k, v in the_record_cnts.items())
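    # cnt_str looks like "families=<n>. corporate=<n>. persons=<n>. "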

    # print(cnt_str)

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + MY_NAME + ". " + cnt_str + " Start: " +
               start_time + ". Finished: " + end_time + " (duration: " +
               my_duration + ").")

    log_range = "log!A:A"
    log_sheet = dataSheet(sheet_id, log_range)

    log_sheet.appendData([[the_log]])

    print(" ")

    print(the_log)
    log_it(SCRIPT_NAME, the_log)
    # digester.post_digest(SCRIPT_NAME, the_log)

    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        "https://docs.google.com/spreadsheets/d/" + \
        str(sheet_id) + "/edit?usp=sharing"

    print(exit_msg)
    log_it(SCRIPT_NAME, exit_msg)

    quit()
Exemple #28
0
def main():

    asf.setServer('Test')

    # Google sheet used for reporting changes.

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'resources!A:Z')

    id_file = 'resource_replacements.csv'
    output_folder = 'output/resource_replacements'

    # Read a list of repo and object ids (csv)
    the_ids = []
    with open(id_file) as ids:
        for row in csv.reader(ids):
            the_ids.append([row[0], row[1]])

    # Search/replace patterns
    the_search_pattern = 'NCC'
    the_replace_pattern = 'NNC'
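    # re.sub treats the search pattern as a regex; 'NCC' has no special
    # characters, so this is effectively a literal substring replacement.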

    the_before_afters = []

    the_heads = ['repo', 'asid', 'before', 'after']

    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = (output_folder + '/' + an_obj[0] + '_' + an_obj[1] +
                    '_old.json')

        # read from API
        x = asf.getResource(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')

        with open(out_path, "w+") as f:
            f.write(x)

        x = json.loads(x)

        the_old_field_data = x['user_defined']['string_2']

        y = x
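        # y refers to the same dict as x (not a copy); the original field value
        # was saved above in the_old_field_data for the before/after comparison.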

        y['user_defined']['string_2'] = re.sub(the_search_pattern,
                                               the_replace_pattern,
                                               x['user_defined']['string_2'])

        if y['user_defined']['string_2'] == the_old_field_data:
            the_new_field_data = "[no change]"
        else:
            the_new_field_data = y['user_defined']['string_2']

        the_before_afters.append([
            an_obj[0], an_obj[1], '{string_2} ' + the_old_field_data,
            '{string_2} ' + the_new_field_data
        ])

        # convert dict back to json for posting.
        z = json.dumps(y)

        # Post the fixed object back to API.
        post = asf.postResource(an_obj[0], an_obj[1], z)
        print(post)

    # Report changes to Google Sheet

    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)