Example #1
def main():

    asf.setServer('Prod')

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  #set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y-%m-%d"))

    sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:

        print('processing ' + d['filter'])

        the_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:

            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='mtime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                print(row)
                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))
        # the_sheet.clear()

        # the_sheet.appendData([the_fields])
        the_sheet.appendData(the_modifieds)

    quit()
Example #2
def main():
    # Main code goes here.

    asf.setServer("Prod")

    output_folder = "output/resource_remove_links"
    the_lookup_csv = "id_lookup_prod.csv"
    bibid_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-161-remove-links/acfa-161-remove-links.txt"

    # Read a list of bibids (csv)
    the_bibids = []
    with open(bibid_file) as ids:
        for row in csv.reader(ids):
            the_bibids.append(row[0])

    for b in the_bibids:

        try:
            repo, asid = asf.lookupByBibID(b, the_lookup_csv)
            print("Processing " + str(b) + "...")

            out_path_old = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_old.json")
            out_path_new = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_new.json")

            x = asf.getResource(repo, asid)

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(x)

            x_dict = json.loads(x)
            print(x_dict["ead_location"])
            if "ead_location" in x_dict:
                del x_dict["ead_location"]
            else:
                print("No URL to delete!")

            y = json.dumps(x_dict)
            # print(y)

            post = asf.postResource(repo, asid, y)
            print(post)

            # Save copy of new object
            print("Saving data to " + out_path_new + "....")

            with open(out_path_new, "w+") as f:
                f.write(y)

        except:
            print("Error: Could not process " + str(b))
            print(sys.exc_info())
            # raise

    quit()
Example #3
def main():
    # Test functions here.

    from pprint import pprint

    server = 'Test'
    asf.setServer(server)

    # The resource to scan
    the_resource = (4, 6288)

    # A place to put output of saved json objects (optional)
    output_folder = 'output/replace_extrefs'

    # Retrieve all archival objects under a given resource
    x = asf.getResponse('/repositories/' + str(the_resource[0]) +
                        '/resources/' + str(the_resource[1]) +
                        '/ordered_records')
    y = json.loads(x)['uris']

    # Select only the ones that are items or files, and add to a list
    the_refs = [r['ref'] for r in y if r['level'] in ['item', 'file']]

    cnt = 0

    for a_ref in the_refs:
        ref_decomposed = a_ref.split('/')
        repo, asid = ref_decomposed[2], ref_decomposed[4]

        ref_json = asf.getArchivalObject(repo, asid)

        out_path = output_folder + '/' + str(repo) + '_' + str(asid) + '.json'

        data_old = ref_json

        # The regex substitution
        repl = re.subn(r'<extref\s+type=\\"simple\\"\s+href=',
                       r'<extref xlink:type=\"simple\" xlink:href=',
                       ref_json,
                       flags=re.DOTALL)

        if repl[1] > 0:  # [1] is the count of replacements from subn
            # there is a change
            # Save copy of existing object
            print('Saving data to ' + out_path + '....')

            with open(out_path, "w+") as f:
                f.write(data_old)

            data_new = repl[0]
            cnt += 1
            print('Posting ' + str(repo) + '_' + str(asid) + ' to ' + server)
            z = asf.postArchivalObject(repo, asid, data_new)
            print(z)
            print(' ')

    print('Total replacements: ' + str(cnt))
Example #4
def main():
    # SERVER = "Test"  # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    LOOKUP = '/Users/dwh2128/Documents/git/dcps-utils/archivesspace/as_reports/id_lookup_prod.csv'

    sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI'
    read_sheet = dataSheet(sheet_id, 'TEST!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'Output!A:Z')

    the_data = read_sheet.getData()
    the_data.pop(0)

    # print(the_refs)

    the_output = []
    for r in the_data:
        bibid = r[0]
        repo = r[1]
        ref = r[2]
        extref_old = r[3]
        extref_new = r[5]
        the_res = json.loads(asf.getResourceByBibID(bibid, LOOKUP))
        # pprint(the_res)

        asid = the_res['uri'].split('/')[4]

        print("repo: " + str(repo) + "; asid: " + str(asid))

        the_notes = json.dumps(the_res['notes'])
        # print(the_notes)
        print(" ")

        the_new_notes = replace_notes(
            the_notes, [
                # fix problem of leading space in href
                {'find': 'xlink:href=\\" http',
                 'replace': 'xlink:href=\\"http'},
                # replace old url with new one
                {'find': extref_old,
                 'replace': extref_new}])

        # print(the_new_notes)

        the_res['notes'] = json.loads(the_new_notes)

        x = asf.postResource(repo, asid, json.dumps(the_res))
        out_row = [SERVER, repo, asid, ref, extref_old, extref_new, str(x)]
        print(out_row)
        the_output.append(out_row)

    # # write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()
Example #5
def main():

    sheet_id = "1GEeNpKBhfjOCJGx1zJfi6XgZ4OWhGhzWsOHRT9DkmpY"

    # list_sheet = dataSheet(sheet_id, 'Test!A:Z')  # test
    list_sheet = dataSheet(sheet_id, "batch!A:Z")
    report_sheet = dataSheet(sheet_id, "output!A:Z")

    the_uris = list_sheet.getDataColumns()[0]

    output_data = []
    for uri in the_uris:
        asid = uri.split("/")[3]
        x = fix_agent(asid, "families")
        pprint(x["display_name"])
        res = asf.postAgent(asid, json.dumps(x), agent_type="families")
        print(res)
        row = [SERVER, uri, str(res)]
        output_data.append(row)

    print(output_data)

    report_sheet.appendData(output_data)

    quit()
Example #6
def main():

    asf.setServer('Prod')

    # the_repos = [2, 3, 4, 5]
    the_repos = [2]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]

    the_sheet = dataSheet('198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY',
                          'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # Get the repo id and asid from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid)
            row.insert(0, repo_id)
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)
            print(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    print('Total unpublished: ' + str(len(the_unpublished)))

    # the_sheet.clear()
    # the_sheet.appendData([the_fields])
    # the_sheet.appendData(the_unpublished)


    quit()
Example #7
def main():
    SERVER = "Prod"  # test
    # SERVER = "Prod"
    asf.setServer(SERVER)

    sheet_id = '1Jbdhda0HbmHKJ7COOJ3CBzdMwpSeIbYHyXzr179ETpI'
    read_sheet = dataSheet(sheet_id, 'TEST!A:Z')  # Test
    write_sheet = dataSheet(sheet_id, 'Output!A:Z')

    the_data = read_sheet.getData()
    the_data.pop(0)

    # print(the_refs)

    the_output = []
    for r in the_data:
        repo = r[1]
        ref = r[2]
        extref_old = r[3]
        extref_new = r[5]
        the_ao = json.loads(asf.getArchivalObjectByRef(repo, ref))
        asid = the_ao['uri'].split('/')[4]

        print("asid: " + str(asid))

        the_notes = json.dumps(the_ao['notes'])

        # fix problem of leading space in href
        the_new_notes = the_notes.replace('xlink:href=\\" http',
                                          'xlink:href=\\"http')
        # replace old url with new one
        the_new_notes = the_new_notes.replace(extref_old, extref_new)

        print(the_new_notes)
        the_ao['notes'] = json.loads(the_new_notes)

        pprint(the_ao)

        x = asf.postArchivalObject(repo, asid, json.dumps(the_ao))
        out_row = [SERVER, repo, asid, ref, extref_old, extref_new, str(x)]
        print(out_row)
        the_output.append(out_row)

    # write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()
Example #8
def main():

    server = 'Prod'
    asf.setServer(server)

    enum_num = 14  # extent_extent_type enumeration
    extent_data = asf.getEnumeration(enum_num)

    extent_usage_csv = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-111-extents-cleanup/extent-values-prod3.tsv'

    output_folder = 'output/enumerations'

    # Paths for reporting before/after data
    out_path_old = output_folder + '/' + str(enum_num) + 'PROD_old.json'
    out_path_new = output_folder + '/' + str(enum_num) + 'PROD_new.json'

    # Save copy of existing object
    print('Saving data to ' + out_path_old + '....')
    with open(out_path_old, "w+") as f:
        f.write(extent_data)

    # Load list from csv
    csv.register_dialect('my_dialect', delimiter='\t', quoting=csv.QUOTE_NONE)
    data = []
    with open(extent_usage_csv) as the_csv_data:
        for row in csv.reader(the_csv_data, 'my_dialect'):
            data.append(row)

    # A list of ids of extent values to remove
    unused_extents = [x[0] for x in data if x[2] == 'Not used.']

    for e in unused_extents:
        print('suppressing ' + str(e))
        # mode='suppress' to suppress, mode='unsuppress' to unsuppress
        post = asf.suppressEnumerationValue(e, mode='suppress')
        print(post)

    extent_data_new = asf.getEnumeration(enum_num)

    # Save updated object
    print('Saving data to ' + out_path_new + '....')
    with open(out_path_new, "w+") as f:
        f.write(extent_data_new)
Example #9
def main():

    # set to Prod | Dev | Test
    asf.setServer('Prod')

    bibid_file = "ead_bibids_20190520.txt"
    lookup_file = "id_lookup_prod_20190522.csv"
    outfile_loc = "ead_as_qc_reports/ead_as_qc_xml_PROD1"

    with open(bibid_file) as f:
        the_bibids = [line.rstrip('\n') for line in f]

    the_errors = []
    the_processed = []

    for a_bibid in the_bibids:
        print('Processing bibid: ' + a_bibid)
        if a_bibid:
            try:
                the_lookup = asf.lookupByBibID(a_bibid, lookup_file)
                the_repo = the_lookup[0]
                the_asid = the_lookup[1]
                the_processed.append(a_bibid)
            except:
                # Can't find in lookup
                the_repo = 0
                the_asid = 0
                the_errors.append(a_bibid)

        if (a_bibid and the_asid != 0):
            the_ead = asf.getEAD(the_repo, the_asid)

            the_filepath = outfile_loc + '/' + a_bibid + '_ead.xml'

            with open(the_filepath, "w") as myfile:
                myfile.write(the_ead)

    # Report results
    print('Processed ' + str(len(the_processed)) + ' records.')
    if len(the_errors) > 0:
        print('*** Warning: ' + str(len(the_errors)) +
              ' errors. Could not process id ' + ', '.join(the_errors) +
              ' ***')
Example #10
def fix_agent(asid, agent_type):
    x = json.loads(asf.getAgent(asid, agent_type=agent_type))

    for name in x["names"]:
        print(name)
        if name["is_display_name"] == True:
            name["source"] = "local"
            name["rules"] = "dacs"

    x["display_name"]["source"] = "local"
    x["display_name"]["rules"] = "dacs"

    return x
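# A minimal usage sketch for fix_agent (not from the original scripts): the
# asid 12345 is a hypothetical example, and asf (ASFunctions), json and pprint
# are assumed to be imported as in the surrounding examples. The same
# fix-then-post pattern appears in the family-agents batch in Example #5.
def fix_one_family_agent(asid=12345):
    asf.setServer("Test")  # try against the test instance first
    fixed = fix_agent(asid, "families")
    pprint(fixed["display_name"])
    # Post the corrected agent record back to the API.
    res = asf.postAgent(asid, json.dumps(fixed), agent_type="families")
    print(res)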
Example #11
def main():
    # Main code goes here.

    asf.setServer("Prod")

    lookup_csv = "id_lookup_prod.csv"
    id_file = "/Users/dwh2128/Documents/ACFA/TEST/ACFA-226-oclc/035s_20200915.txt"

    # Read a list of bibids and oclc strings
    the_data = []
    with open(id_file) as ids:
        for row in csv.reader(ids, delimiter="|"):
            the_data.append([row[0], row[1], row[2]])

    for a_row in the_data:
        bibid = a_row[0]
        print(bibid)
        str_2 = a_row[1]
        str_3 = a_row[2]
        try:
            repo, asid = asf.lookupByBibID(bibid, lookup_csv)

            x = asf.getResource(repo, asid)
            y = json.loads(x)

            user_defnd = y["user_defined"] if "user_defined" in y else {}
            user_defnd["string_2"] = str_2
            user_defnd["string_3"] = str_3

            print(user_defnd)

            y["user_defined"] = user_defnd

            z = json.dumps(y)
            post = asf.postResource(repo, asid, z)
            print(post)

        except Exception as e:
            print(e + ": Could not lookup " + str(bibid))
Example #12
def main():
    # SERVER = "Test" # test
    SERVER = "Prod"
    asf.setServer(SERVER)

    sheet_id = '1OABHEJF1jqA1vlbW5yTENry5W7YqKlag5nJDJ9ouCzg'
    # read_sheet = dataSheet(sheet_id, 'Test!A:Z')  # Test
    read_sheet = dataSheet(sheet_id, 'Prod!A:Z')
    write_sheet = dataSheet(sheet_id, 'output!A:Z')

    the_refs = read_sheet.getDataColumns()[0]
    # print(the_refs)

    the_output = []
    for r in the_refs:
        the_ao = json.loads(asf.getArchivalObjectByRef(2, r))
        asid = the_ao['uri'].split('/')[4]
        old_date = str(the_ao['dates'][0]['begin'])
        new_ao = fix_begin_date(2, the_ao)
        new_date = str(new_ao['dates'][0]['begin'])
        print("asid: " + str(asid))
        x = asf.postArchivalObject(2, asid, json.dumps(new_ao))
        out_row = [SERVER, r, asid, old_date, new_date, str(x)]
        # print(out_row)
        the_output.append(out_row)

    write_sheet.clear()
    write_sheet.appendData(the_output)
    quit()

    # Unreachable scratch/test code below (left in place from development):
    x = fix_begin_date(2, 'b2ec9ce511e4212ebb145fb909ca85bd')
    print(x)

    pprint(
        json.loads(
            asf.getArchivalObjectByRef(2, 'b2ec9ce511e4212ebb145fb909ca85bd')))
    quit()
Example #13
def add_authority(server, asid, uri, source=None):
    # function to (1) query subject and determine if it already has
    # an authority uri, (2) if not, add in the provided URI,
    # and (3) return a response for reporting.
    subj = asf.getSubject(asid)
    if 'authority_id' in subj:
        print('*** Subject ' + str(asid) + ' already has authority: ' +
              subj['authority_id'] + ' .... Skipping....')
        return [server, asid, subj['authority_id'], subj['source'], 'Y']
    else:
        subj['authority_id'] = uri
        if source is None:
            source = subj['source']
            # If a new source is provided, add it in as well.
        else:
            subj['source'] = source
        try:
            resp = asf.postSubject(asid, json.dumps(subj))
        except json.JSONDecodeError as e:
            resp = 'JSON ERROR: ' + str(asid) + ' :: ' + str(e)
        except Exception as e:
            resp = 'ERROR: ' + str(asid) + ' :: ' + str(e)
        print(resp)
        return [server, asid, uri, str(source), '', str(resp)]
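# A minimal driver sketch for add_authority (not from the original script).
# Assumption: the batch sheet holds one subject asid in column A and the
# authority URI in column B; that column layout is hypothetical. list_sheet,
# report_sheet and SERVER are defined at module level as in Example #22.
def process_subject_batch():
    the_rows = list_sheet.getData()
    the_rows.pop(0)  # drop the header row
    the_output = []
    for a_row in the_rows:
        asid, uri = a_row[0], a_row[1]
        the_output.append(add_authority(SERVER, asid, uri))
    report_sheet.appendData(the_output)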
Example #14
def get_agent_data(name, endpoint, pickle_path):
    print("Getting agents: " + name)
    # out_path = os.path.join(my_path, "output/agents_" + i["name"] + ".pickle")
    # out_path = os.path.join(out_folder, "agents_" + i["name"] + ".pickle")
    # Get a list of agent ids from API
    agents_list = json.loads(asf.getResponse(endpoint + "?all_ids=true"))

    agent_cnt_str = "Number of agents (" + \
        name + "): " + str(len(agents_list))
    print(agent_cnt_str)
    log_it(SCRIPT_NAME, agent_cnt_str)

    agent_data = []

    # Loop through agent ids and get full record from API.
    for cnt, agent in enumerate(agents_list):
        # print("COUNT: " + str(cnt))
        # print("Agent # " + str(agent))
        x = asf.getResponse(endpoint + "/" + str(agent))
        agent_data.append(json.loads(x))

    # Save data as pickle
    util.pickle_it(agent_data, pickle_path)
    return agent_data
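# A minimal usage sketch for get_agent_data (not from the original script).
# The pickle path is a hypothetical local path; the /agents/families endpoint
# and the call pattern match the agent-reporting main() in Example #20.
def fetch_family_agents():
    asf.setServer("Test")
    return get_agent_data(
        "families",                       # label used for logging
        "/agents/families",               # ArchivesSpace agents endpoint
        "output/agents_families.pickle")  # hypothetical output pickle path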
Example #15
def harvestBatchEAD(ids_file, lookup_file, out_folder):
    bibidFile = ids_file
    lookupFile = lookup_file
    outFolder = out_folder

    with open(bibidFile) as f:
        the_bibids = [line.rstrip('\n') for line in f]

    the_errors = []
    the_processed = []

    for a_bibid in the_bibids:
        print('Processing bibid: ' + a_bibid)
        if a_bibid:
            try:
                the_lookup = asf.lookupByBibID(a_bibid, lookupFile)
                the_repo = the_lookup[0]
                the_asid = the_lookup[1]
                the_processed.append(a_bibid)
            except:
                # Can't find in lookup
                the_repo = 0
                the_asid = 0
                the_errors.append(a_bibid)

        # print(the_repo)
        # print(the_asid)

        if (a_bibid and the_asid != 0):
            the_ead = getSingleEAD(the_repo, the_asid)

            the_filepath = outFolder + '/' + a_bibid + '_ead.xml'
            with open(the_filepath, "w") as myfile:
                myfile.write(the_ead)

    # Report results
    print('Processed ' + str(len(the_processed)) + ' records.')
    if len(the_errors) > 0:
        print('*** Warning: ' + str(len(the_errors)) +
              ' errors. Could not process id ' + ', '.join(the_errors) +
              ' ***')
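# A minimal usage sketch for harvestBatchEAD (not from the original script).
# The file names and output folder are placeholders; getSingleEAD must be
# available in the calling module, since harvestBatchEAD relies on it.
def run_ead_harvest():
    harvestBatchEAD(
        "ead_bibids.txt",      # one bibid per line (hypothetical file)
        "id_lookup_prod.csv",  # bibid-to-repo/asid lookup CSV
        "output/ead_harvest")  # hypothetical folder for the *_ead.xml output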
Example #16
def main():
    # Main code goes here.

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    asf.setServer("Prod")

    the_sheet = dataSheet("1UQm7ffd1Kq4zqlzHZajd9YkwW1_nmOJFS1W7nI-c_Vk",
                          "new-batch!A:Z")
    output_folder = os.path.join(my_path, "output/resource_collecting_area")

    the_rows = the_sheet.getData()
    the_new_rows = []

    the_heads = the_rows.pop(0)

    the_new_rows.append(the_heads)

    coll_area_index = 8  # the column of collecting area

    for a_row in the_rows:
        the_new_row = a_row
        # print(a_row)
        coll = ""
        repo, asid = a_row[0], a_row[1]
        if len(a_row) > coll_area_index:
            # if there is a collecting location to add
            coll = a_row[coll_area_index]

            the_resource = asf.getResource(repo, asid)

            out_path_old = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_old.json")
            out_path_new = (output_folder + "/" + str(repo) + "_" + str(asid) +
                            "_new.json")

            # Save copy of existing object
            print("Saving data to " + out_path_old + "....")
            with open(out_path_old, "w+") as f:
                f.write(the_resource)

            the_data = json.loads(the_resource)

            fix = False
            if "user_defined" in the_data:
                the_user_defined = the_data["user_defined"]
                if "enum_4" in the_user_defined:
                    print("Already has enum_4! Skipping.")
                else:
                    fix = True
                    the_user_defined["enum_4"] = coll
                    the_data["user_defined"] = the_user_defined
                    the_new_resource = json.dumps(the_data)

                    # Save copy of new object
                    print("Saving data to " + out_path_new + "....")
                    with open(out_path_new, "w+") as f:
                        f.write(the_new_resource)

                if fix == True:

                    try:
                        post = "[NONE]"
                        post = asf.postResource(repo, asid, the_new_resource)
                        print(post)
                    except:
                        print("Error: There was a problem posting resource " +
                              str(repo) + ":" + str(asid) + "!")

                    the_new_row.append(coll)
            else:
                print("ERROR: No user_defined data in " + str(repo) + ":" +
                      str(asid))

        the_new_rows.append(the_new_row)

    the_sheet.clear()
    the_sheet.appendData(the_new_rows)

    # print(the_new_rows)

    quit()
Example #17
script_name = os.path.basename(my_name)

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

sheet_id = "1pZk2tPMuZDOd1veOBSJNRk2fprA6p3Qb3WKZDtZay88"
the_sheet = dataSheet(sheet_id, "subjects!A:Z")
# the_sheet = dataSheet(sheet_id, "test!A:Z") # test

now1 = datetime.datetime.now()
start_time = str(now1)
end_time = ""  # set later

# First get the subject records from API (this can take a long time!)

asf.setServer("Prod")  # AS instance: Prod | Dev | Test

# out_path = os.path.join(my_path, "output/subjects.pickle")
out_path = "/cul/cul0/ldpd/archivesspace/subjects/subjects.pickle"

# uncomment to do the full download.
the_subjects = asf.getSubjects()
util.pickle_it(the_subjects, out_path)

# Report the saved data to Google Sheet

# List of fields to extract, expressed as dpaths.
the_fields = [
    ["uri", "uri"],
    ["title", "title"],
    ["source", "source"],
Example #18
def main():

    # Set to Test | Dev | Prod
    asf.setServer('Prod')

    the_report_sheet = dataSheet(
        '1wNO0t2j5G9U0hUmb7E-jLd4T5skTs1aRxN7HrlyZwEI', 'daos-publish!A:Z')

    # Set value to switch to, publish (True) or unpublish (False)
    publish_value = True

    # id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-162/acfa-162-mitchell.csv'
    id_file = '/Users/dwh2128/Documents/ACFA/TEST/ACFA-162/acfa-162-kay.csv'
    output_folder = 'output/daos-publish'

    # Read a list of repo and object ids (csv)
    the_ids = []
    ids = open(id_file)
    for row in csv.reader(ids):
        the_ids.append([row[0], row[1]])
    ids.close()

    the_before_afters = []

    the_heads = ['repo', 'asid', 'uid', 'title', 'before', 'after']

    the_before_afters.append(the_heads)

    for an_obj in the_ids:

        out_path = output_folder + '/' + an_obj[0] + '_' + an_obj[1] + '_old.json'

        # read from API

        # try:
        x = asf.getDigitalObjectFromParent(an_obj[0], an_obj[1])

        # Save copy of existing object
        print('Saving data to ' + out_path + '....')

        f = open(out_path, "w+")
        f.write(x)
        f.close()

        x = json.loads(x)

        # the_old_field_data = x['file_versions'][0]['file_uri']
        the_old_field_data = x['publish']

        asid = str(
            x['uri'].split('/')[-1])  # get the asid from the uri string.

        title = x['title']

        repo = str(an_obj[0])

        y = x

        # Here set the desired value
        y['publish'] = publish_value

        if y['publish'] == the_old_field_data:
            the_new_field_data = "[no change]"
        else:
            the_new_field_data = y['publish']

        the_before_afters.append([
            an_obj[0], asid, an_obj[1], title, the_old_field_data,
            the_new_field_data
        ])

        # convert dict back to json for posting.
        z = json.dumps(y)

        # Post the fixed object back to API.
        # (Comment these out for testing.)
        if the_new_field_data != "[no change]":
            resp = asf.postDigitalObject(repo, asid, z)
            print(resp)
        else:
            print('No update: skipping record.')

        # except:
        #     print('Could not retrieve record ' + str(an_obj[1]))

    # Report changes to Google Sheet
    print('Writing before/after info to sheet...')
    the_report_sheet.clear()
    the_report_sheet.appendData(the_before_afters)

    print("Done!")

    quit()
Example #19
def main():

    asf.setServer('Prod')  # AS instance: Prod | Dev | Test

    mode = 'Prod'  # Prod or Test

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # today_str = str(date.today().strftime("%Y%m%d"))
    yest_str = str((date.today() - timedelta(days=1)).strftime("%Y%m%d"))

    ########################
    ### PROCESS OAI DATA ###
    ########################

    # Set path to Saxon processor
    # saxon_path = os.path.join(my_path, "../../resources/saxon-9.8.0.12-he.jar")

    # XSLT file to generate report
    marc_xslt_file = os.path.join(my_path, '../xslt/marcDataExtract.xsl')

    if mode == 'Prod':
        # OAI XML file to use as source
        # source_dir='/cul/cul0/lito/libsys/voyager/prod/data/loads/AS_harvest'
        source_dir = '/cul/cul0/ldpd/archivesspace/oai'
        sheet_id = '198ON5qZ3MYBWPbSAopWkGE6hcUD8P-KMkWkq2qRooOY'
        oai_file = source_dir + '/' + yest_str + '.asAllRaw.xml'

    else:  # TEST
        yest_str = "20190915"
        # OAI XML file to use as source
        source_dir = '/Users/dwh2128/Documents/ACFA/exist-local/backups/cached_eads/cached_eads_20190912'  # local test
        sheet_id = '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ'
        oai_file = yest_str + '.asAllRaw.xml'
    the_sheets = {
        'oai': dataSheet(sheet_id, 'oai!A:Z'),
        'oai_last': dataSheet(sheet_id, 'oai_last!A:Z'),
        'log': dataSheet(sheet_id, 'log!A:Z')
    }

    the_outpath = os.path.join(my_path,
                               'output/' + yest_str + '.marc_reporter_out.xml')

    print(' ')

    # Copy oai current data to oai_last sheet for diff
    the_old_data = the_sheets['oai'].getData()
    the_sheets['oai_last'].clear()
    the_sheets['oai_last'].appendData(the_old_data)
    # Process OAI MARC and output to CSV
    util.saxon_process(oai_file, marc_xslt_file, the_outpath)

    # clear data from "new" sheet
    the_sheets['oai'].clear()

    # Send result csv to Google Sheet.
    y = the_sheets['oai'].importCSV(the_outpath, delim='|')

    print(' ')

    ########################
    ### PROCESS UNPUBLISHED ###
    ########################

    print('Finding unpublished records...')

    the_repos = [2, 3, 4, 5]
    the_fields = [
        'id', 'title', 'identifier', 'create_time', 'system_mtime',
        'last_modified_by', 'json'
    ]
    the_heads = [
        'REPO', 'REPO_ID', 'RESOURCE_ID', 'TITLE', 'BIBID', 'CREATE_TIME',
        'SYSTEM_MTIME', 'LAST_MODIFIED_BY'
    ]

    unpubs_sheet = dataSheet(sheet_id, 'unpublished!A:Z')

    the_unpublished = []

    for r in the_repos:
        print('searching repo ' + str(r))

        x = asf.getUnpublished(r, filter='resources', fields=the_fields)
        # print(x)

        for a in x:
            row = [a[v] for v in the_fields]
            # print(row)
            my_json = json.loads(row.pop(6))
            try:
                call_no = my_json['user_defined']['string_1']
            except:
                call_no = ''
            # get the repo from the uri string.
            repo_id = int(str(row[0].split('/')[-3]).rstrip())
            # get the asid from the uri string.
            asid = int(str(row[0].split('/')[-1]).rstrip())
            row.pop(0)
            row.insert(0, asid), row.insert(0, repo_id)
            if 'UA' in call_no:
                repo = 'nnc-ua'
            else:
                repo = get_repo(repo_id)
            row.insert(0, repo)
            the_unpublished.append(row)
        print('Repo ' + str(r) + ': ' + str(len(x)))

    # print('Total unpublished: ' + str(len(the_unpublished)))
    msg = 'Total unpublished: ' + str(len(the_unpublished))
    print(msg)
    digester.post_digest(script_name, msg)  # Test

    unpubs_sheet.clear()
    unpubs_sheet.appendData([the_heads])
    unpubs_sheet.appendData(the_unpublished)

    ########################
    ### GET NEWLY CREATED ###
    ########################

    data_data = [{
        'range': 'resource-changes!A:Z',
        'filter': 'resources'
    }, {
        'range': 'accession-changes!A:Z',
        'filter': 'accessions'
    }]

    for d in data_data:

        print('processing ' + d['filter'])

        the_delta_sheet = dataSheet(sheet_id, d['range'])

        the_date = yest_str
        # the_date = '2019-08-27'
        the_repos = [2, 3, 4, 5]
        the_fields = [
            'id', 'title', 'identifier', 'create_time', 'system_mtime',
            'last_modified_by', 'publish'
        ]

        the_heads = [
            'repo', 'asid', 'title', 'identifier', 'create_time',
            'system_mtime', 'last_modified_by', 'publish'
        ]

        the_modifieds = []

        for r in the_repos:

            print('searching repo ' + str(r))

            x = asf.getByDate(r,
                              the_date,
                              date_type='ctime',
                              comparator='equal',
                              filter=d['filter'],
                              fields=the_fields)
            for a in x:
                row = [a[v] for v in the_fields]
                # print(row)
                # get the repo from the uri string.
                repo = str(row[0].split('/')[-3]).rstrip()
                # get the asid from the uri string.
                asid = str(row[0].split('/')[-1]).rstrip()
                row.pop(0)
                row.insert(0, asid), row.insert(0, repo)

                the_modifieds.append(row)
                # print(list(a.values()))
                # the_modifieds.append(list(a.values()))
            print('Repo ' + str(r) + ': ' + str(len(x)))

        print('Total ' + d['filter'] + ': ' + str(len(the_modifieds)))

        digester.post_digest(script_name, 'New ' + d['filter'] + ': ' +
                             str(len(the_modifieds)))  # Test
        # the_sheet.clear()

        # the_sheet.appendData([the_fields])
        the_delta_sheet.appendData(the_modifieds)

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = 'Data imported by ' + my_name + '. Start: ' + start_time + \
        '. Finished: ' + end_time + ' (duration: ' + my_duration + ').'

    the_sheets['log'].appendData([[the_log]])

    print(' ')

    print(the_log)

    digester.post_digest(script_name, the_log)  # Test

    print(' ')

    print('Script done. Updated data is available at ' + the_sheets['oai'].url)
Example #20
def main():

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    if DEBUG is True:
        sheet_id = "18uvn9wIABHVIdjlSRNXqnHUKB2aTvZgKO62e-UFNuO8"  # test
    else:
        sheet_id = "1dTeMAK_cGWAUvrqvAiY2hGy4gJewrmWjnuIZu8NhWwE"

    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ""  # set later

    # First get the agent records from API (this can take a long time!)

    asf.setServer("Prod")  # AS instance: Prod | Dev | Test

    if DEBUG is True:
        out_folder = "/cul/cul0/ldpd/archivesspace/test/agents"
    else:
        out_folder = "/cul/cul0/ldpd/archivesspace/agents"

    family_agents_file = os.path.join(out_folder, "agents_families.pickle")
    corp_agents_file = os.path.join(out_folder, "agents_corporate.pickle")
    persons_agents_file = os.path.join(out_folder, "agents_persons.pickle")

    the_info = [
        {
            "name": "families",
            "endpoint": "/agents/families",
            "sheet": dataSheet(sheet_id, "families!A:Z"),
            "pickle": family_agents_file
        },
        {
            "name": "corporate",
            "endpoint": "/agents/corporate_entities",
            "sheet": dataSheet(sheet_id, "corporate!A:Z"),
            "pickle": corp_agents_file
        },
        {
            "name": "persons",
            "endpoint": "/agents/people",
            "sheet": dataSheet(sheet_id, "persons!A:Z"),
            "pickle": persons_agents_file
        },
    ]

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["uri", "uri"],
        ["title", "title"],
        ["source", "names/0/source"],
        ["authority_id", "names/0/authority_id"],
        ["is_linked_to_published_record", "is_linked_to_published_record"],
        ["publish", "publish"],
        ["last_modified_by", "last_modified_by"],
        ["last_modified", "system_mtime"],
    ]

    the_record_cnts = {}

    if DEBUG is True:
        print("*** (DEBUG MODE) ***")

    for i in the_info:
        print("Getting agents: " + i["name"])
        agent_data = get_agent_data(i["name"], i["endpoint"], i["pickle"])

        print(" ")

        # Report the saved data to Google Sheet

        the_sheet = i["sheet"]

        the_heads = [x[0] for x in the_fields]
        the_output = [the_heads]

        the_record_cnts[i["name"]] = str(len(agent_data))

        for agent in agent_data:
            the_row = []
            # Use dpath to extract values from dict and compose into rows.
            for af in the_fields:
                try:
                    d = str(dpath.util.get(agent, af[1]))
                except:
                    d = ""
                the_row.append(d)
            # print(the_row)
            the_output.append(the_row)

        the_sheet.clear()
        save = the_sheet.appendData(the_output)
        print(save)

    # Generate log

    print(the_record_cnts)
    print(" ".join(the_record_cnts))

    cnt_str = "".join(k + "=" + v + ". " for k, v in the_record_cnts.items())

    # print(cnt_str)

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + MY_NAME + ". " + cnt_str + " Start: " +
               start_time + ". Finished: " + end_time + " (duration: " +
               my_duration + ").")

    log_range = "log!A:A"
    log_sheet = dataSheet(sheet_id, log_range)

    log_sheet.appendData([[the_log]])

    print(" ")

    print(the_log)
    log_it(SCRIPT_NAME, the_log)
    # digester.post_digest(SCRIPT_NAME, the_log)

    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        "https://docs.google.com/spreadsheets/d/" + \
        str(sheet_id) + "/edit?usp=sharing"

    print(exit_msg)
    log_it(SCRIPT_NAME, exit_msg)

    quit()
Example #21
# Automated reporting of ArchivesSpace accessions info.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
from operator import itemgetter
import datetime
import re
import os.path
import dateutil.parser
import digester  # for generating composite digest of report info.

# set Prod | Dev | Test
target_server = 'Prod'  # Prod | Dev | Test
asf.setServer(target_server)

DEBUG = False
# mode = 'Prod'  # Prod or Test

MY_NAME = __file__
SCRIPT_NAME = os.path.basename(MY_NAME)

# This makes sure the script can be run from any working directory and still find related files.
MY_PATH = os.path.dirname(__file__)

# File to use to lookup bibids
LOOKUP_CSV = os.path.join(MY_PATH, "id_lookup_prod.csv")


def main():
Example #22
# Script to add authorities or make other changes to subjects. See ACFA-287.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
import os.path


SERVER = 'Prod'
asf.setServer(SERVER)

my_name = __file__


# pprint(asf.getSubject(11453))
# quit()

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

sheet_id = '1b-dFdOaWD7AEqzhK0uuGXkonum6wX8Zcriq8-G4l33Q'

# list_sheet = dataSheet(sheet_id, 'Test!A:Z')  # test
list_sheet = dataSheet(sheet_id, 'batch!A:Z')
report_sheet = dataSheet(sheet_id, 'output!A:Z')


def add_authority(server, asid, uri, source=None):
    # function to (1) query subject and determine if it already has
    # an authority uri, (2) if not, add in the provided URI,
Example #23
def main():
    # Main code goes here.

    asf.setServer("Prod")

    on_site = False
    # set to True to get on-site note, False to get off-site note. See the_access_note var below.

    output_folder = "output/resource_on-site_access"

    lookup_csv = "id_lookup_prod.csv"

    # bibid_file = (
    #     "/Users/dwh2128/Documents/ACFA/TEST/ACFA-224-onsite-notes/acfa-224-list_3.csv"
    # )
    bibid_file = (
        "/Users/dwh2128/Documents/ACFA/TEST/ACFA-243-off-site/acfa-243_off-site.csv"
    )

    # Read a list of bibids (csv)
    the_bibids = []
    with open(bibid_file) as ids:
        for row in csv.reader(ids):
            the_bibids.append(row[0])

    if on_site == True:
        the_access_note = {
            "jsonmodel_type": "note_multipart",
            "label": "Restrictions on Access",
            "type": "accessrestrict",
            "rights_restriction": {"local_access_restriction_type": []},
            "subnotes": [
                {
                    "jsonmodel_type": "note_text",
                    "content": "This collection is located on-site.",
                    "publish": True,
                }
            ],
            "publish": True,
        }
    else:
        the_access_note = {
            "jsonmodel_type": "note_multipart",
            "label": "Restrictions on Access",
            "type": "accessrestrict",
            "rights_restriction": {"local_access_restriction_type": []},
            "subnotes": [
                {
                    "jsonmodel_type": "note_text",
                    "content": "This collection is located off-site. You will need to request this material at least three business days in advance to use the collection in the Rare Book and Manuscript Library reading room.",
                    "publish": True,
                }
            ],
            "publish": True,
        }

    for bib in the_bibids:

        try:
            repo, asid = asf.lookupByBibID(bib, lookup_csv)
        except:
            print("Error: No record found for " + str(bib) + ". Skipping...")
            continue

        out_path_old = output_folder + "/" + str(repo) + "_" + str(asid) + "_old.json"
        out_path_new = output_folder + "/" + str(repo) + "_" + str(asid) + "_new.json"

        the_resource = asf.getResource(repo, asid)

        # Save copy of existing object
        print("Saving data to " + out_path_old + "....")

        with open(out_path_old, "w+") as f:
            f.write(the_resource)

        the_data = json.loads(the_resource)

        # Test if there is already an access restriction note.
        has_note = False
        for a_note in the_data["notes"]:
            try:
                if a_note["type"] == "accessrestrict":
                    has_note = True
            except KeyError:
                print("Note has no type -- skipping.")

        if has_note == True:
            print(str(bib) + " - Warning: Already has access note.")
        # else:
        the_data["notes"].append(the_access_note)

        the_new_resource = json.dumps(the_data)

        # Save copy of new object
        print("Saving data to " + out_path_new + "....")

        with open(out_path_new, "w+") as f:
            f.write(the_new_resource)

        try:
            post = asf.postResource(repo, asid, the_new_resource)
            print(post)
        except:
            print(
                "Error: There was a problem posting resource "
                + str(repo)
                + ":"
                + str(asid)
                + "!"
            )

    quit()
Example #24
logging.basicConfig(level=logging.ERROR)
# not doing anything with this yet...

# logging.debug('¥¥¥¥¥¥ This is a debug message')
# logging.info('¥¥¥¥¥¥ This is an info message')
# logging.warning('¥¥¥¥¥¥ This is a warning message')
# logging.error('¥¥¥¥¥¥ This is an error message')
# logging.critical('¥¥¥¥¥¥ This is a critical message')

my_name = __file__

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

asf.setServer('Prod')

print('THIS IS A TEST -- IGNORE!')

print(' ')


print('testing google sheet api...')

# The ID and range of a sample spreadsheet.
the_sheet = dataSheet(
    '1YzM1dinagfoTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ', 'Sheet1!A:Z')
# the_sheet = dataSheet('1YzM1oTUirAoA2hHBfnhSM1PsPt8TkwTT9KlgQ','Sheet1!A:Z')


print(the_sheet.getData())
Example #25
# Script to get barcode and holding info from spreadsheet
# and add to top containers in ArchivesSpace via API. See ACFA-206.

import ASFunctions as asf
import json
from pprint import pprint
from sheetFeeder import dataSheet
import dcps_utils as util
import os.path
import csv
import datetime

asf.setServer('Prod')

my_name = __file__

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

# sheet_id = '1gUx1cPS8POLxqRblYIs1vlpr7yDGOyHmAJqpl6nMo4k'
sheet_id = '1e43qKYvqGQFOMxA70U59yPKPs18y-k3ohRNdU-qrTH0'  # test

# list_sheet = dataSheet(sheet_id, 'report!A:Z')
list_sheet = dataSheet(sheet_id, 'test!A:Z')  # test

the_data = list_sheet.getData()

the_heads = the_data.pop(0)

today = datetime.date.today().strftime("%Y-%m-%d")
Example #26
def main():

    # set to True to use test sheet and test json folder location.
    debug = False

    asf.setServer("Prod")

    my_name = __file__
    script_name = os.path.basename(my_name)

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    now1 = datetime.now()
    start_time = str(now1)
    end_time = ""  # set later
    today_str = str((date.today()).strftime("%Y%m%d"))

    if debug:
        print("[Running script in debug mode...]")
        parent_folder = "/cul/cul0/ldpd/archivesspace/test/resources"  # test folder
        sheet_id = "1wFyLN_Ea7ExCZSMuksB8MTrS9DjsUkwsmaPBujL7x0U"  # test sheet
        the_repos = [4]  # to test
    else:
        parent_folder = "/cul/cul0/ldpd/archivesspace/resources"
        sheet_id = "1T3EpIZmnh3Gk-VAIGtvavTQUIpS7AluyKQ8-sJsS8vg"
        the_repos = [2, 3, 4, 5, 6]

    output_folder = parent_folder + "/" + today_str

    the_sheets = {
        "resources": dataSheet(sheet_id, "Resources!A:Z"),
        "cm": dataSheet(sheet_id, "Collection Management!A:Z"),
        "log": dataSheet(sheet_id, "log!A:Z"),
    }

    # Set number of chars to truncate the scope and bioghist notes.
    trunc_len = 400

    # List of fields to extract, expressed as dpaths.
    the_fields = [
        ["bibid", "/id_0"],
        ["title", "/title"],
        ["published", "/publish"],
        ["create_time", "/create_time"],
        ["system_mtime", "/system_mtime"],
        ["created_by", "/created_by"],
        ["last_modified_by", "/last_modified_by"],
        ["ead_location", "/ead_location"],
        ["ext_number", "/extents/0/number"],
        ["ext_portion", "/extents/0/portion"],
        ["ext_type", "/extents/0/extent_type"],
        # ["integer_1", "/user_defined/integer_1"],
        # ["integer_2", "/user_defined/integer_2"],
        # ["integer_3", "/user_defined/integer_3"],
        ["local call no.", "/user_defined/string_1"],
        ["other ctrl no. 1", "/user_defined/string_2"],
        ["other ctrl no. 2", "/user_defined/string_3"],
        ["other ctrl no. 3", "/user_defined/string_4"],
        # ["enum_1", "/user_defined/enum_1"],
        # ["enum_2", "/user_defined/enum_2"],
        ["description status", "/user_defined/enum_3"],
        ["collecting area", "/user_defined/enum_4"],
        ["level", "level"]
        # (Scope and bioghist notes are added in separately below.)
    ]

    # Get the collection management records for use in report.

    the_cms = []

    fields = [
        "id",
        "parent_id",
        "title",
        "system_mtime",
        "processing_priority",
        "processing_status",
    ]

    print(" ")
    print("*** Retrieve Collection Management Data ***")
    print(" ")

    for r in the_repos:
        print("Getting collection management records for repo: " + str(r) +
              "...")
        cm = asf.getCollectionManagements(r, filter="resource", fields=fields)
        for c in cm:
            row = [c[f] for f in fields]
            the_cms.append(row)

    # a data set of collection management records to post to sheet below.
    the_cms.insert(0, fields)

    print(" ")
    print("*** Retrieve Resource Data ***")
    print(" ")

    # Get the list of resources for each repo and add to the_ids
    the_ids = []
    for r in the_repos:
        print("Getting ids for repo: " + str(r) + "...")
        asids = json.loads(
            asf.getResponse("/repositories/" + str(r) +
                            "/resources?all_ids=true"))

        print(str(len(asids)) + " records found in repo " + str(r) + ".")
        for i in asids:
            the_ids.append([r, i])

    # Construct the head row
    the_heads = [x[0] for x in the_fields]
    the_heads.insert(0, "asid")
    the_heads.insert(0, "repo")
    the_heads.append("scope note")
    the_heads.append("scopenote length")

    the_heads.append("bioghist note")
    the_heads.append("biognote length")

    the_output = [the_heads]

    # Fetch the resources from the ids
    print("Downloading resources...")

    if not os.path.exists(output_folder):
        print("Creating directory " + output_folder + "...")
        os.makedirs(output_folder)

    for repo, asid in the_ids:
        # print("Processsing " + str(repo) + ":" + str(asid) + "...")
        the_row = [repo, asid]
        res_json = asf.getResource(repo, asid)
        res_dict = json.loads(res_json)

        out_path = output_folder + "/" + str(repo) + "_" + str(asid) + ".json"

        # Write the JSON to file.
        with open(out_path, "w+") as f:
            f.write(res_json)

        # Use dpath to extract values from dict and compose into rows.
        for af in the_fields:
            try:
                d = str(dpath.util.get(res_dict, af[1]))
            except:
                d = ""
            the_row.append(d)

        # Process scope and bioghist notes

        the_notes = dpath.util.values(res_dict, "notes/*", afilter=None)

        the_scope_notes = []
        the_biog_notes = []

        for a_note in the_notes:
            try:
                if a_note["type"] == "scopecontent":
                    the_scope_notes.append(a_note)
            except:
                pass
            try:
                if a_note["type"] == "bioghist":
                    the_biog_notes.append(a_note)
            except:
                pass

        if the_scope_notes:
            # If there are scope notes, grab all the text and concatenate. Then get the total length in # chars.
            scope_note_texts = [
                s["subnotes"][0]["content"] for s in the_scope_notes
            ]
            the_scope_text = " ".join(scope_note_texts)
            scope_note_len = len(the_scope_text)

            scope_note_short = truncate_str(the_scope_text, length=trunc_len)
        else:
            scope_note_short = ""
            scope_note_len = 0

        if the_biog_notes:
            # If there are bioghist notes, grab all the text and concatenate. Then get the total length in # chars.
            biog_note_texts = [
                s["subnotes"][0]["content"] for s in the_biog_notes
            ]
            the_biog_text = " ".join(biog_note_texts)
            biog_note_len = len(the_biog_text)

            biog_note_short = truncate_str(the_biog_text, length=trunc_len)
        else:
            biog_note_short = ""
            biog_note_len = 0

        the_row.append(scope_note_short)
        the_row.append(str(scope_note_len))
        the_row.append(biog_note_short)
        the_row.append(str(biog_note_len))

        the_output.append(the_row)

    # Zip up the JSON files for storage.
    zip_out = make_archive(today_str,
                           "zip",
                           root_dir=parent_folder,
                           base_dir=today_str)

    print(zip_out)

    # Zip is saved in working dir; move to correct location.
    print("Saving zip file " + str(today_str) + ".zip to " + parent_folder)

    # Test if file already exists.
    if os.path.exists(parent_folder + "/" + str(today_str) + ".zip"):
        print("File " + parent_folder + "/" + str(today_str) +
              ".zip exists already. Replacing with new zip file...")

        os.remove(parent_folder + "/" + str(today_str) + ".zip")

    move(zip_out, parent_folder)

    # Remove the json folder once zip is in place.
    rmtree(parent_folder + "/" + today_str)

    util.file_cleanup(parent_folder, 60)

    # Write output to Google sheet.

    print(" ")
    print("*** Writing Data to Report ***")
    print(" ")

    the_sheets["cm"].clear()
    the_sheets["cm"].appendData(the_cms)
    digester.post_digest(
        script_name,
        "Total collection management records: " + str(len(the_cms) - 1))

    the_sheets["resources"].clear()
    the_sheets["resources"].appendData(the_output)
    digester.post_digest(
        script_name,
        "Total number of resource records: " + str(len(the_output) - 1))

    ########################
    ### FINISH UP ###
    ########################

    # Generate log string.
    now2 = datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    the_log = ("Data imported by " + my_name + ". Start: " + start_time +
               ". Finished: " + end_time + " (duration: " + my_duration + ").")

    the_sheets["log"].appendData([[the_log]])

    print(" ")

    print(the_log)

    print(" ")

    exit_msg = "Script done. Updated data is available at " + \
        the_sheets["resources"].url
    print(exit_msg)
    digester.post_digest(script_name, exit_msg)
Example #27
def main():
    now1 = datetime.datetime.now()
    start_time = str(now1)
    end_time = ''  # set later
    # day_offset = now1.weekday() + 1 # Calculate the Sunday of current week
    day_offset = 7  # use past seven days, regardless of current day

    print('Script ' + MY_NAME + ' begun at ' + start_time + '. ')

    if not DEBUG:
        the_sheet_id = '1JA5bRSnYV80sx4m5SOFQ6QJ4u21SXvQeNdNbuRVCdds'
    else:
        the_sheet_id = '1e_TAK8eUsaHltBu9J5bNO1twThqt7_nE5olmz2pdCUw'  # test doc
        day_offset = 14  # use past 2 weeks for testing

    # Set date stamp of start of week (Sunday) to determine recently created accessions.
    begin_of_week = (now1 - datetime.timedelta(day_offset)).date()

    the_sheet_rbml = dataSheet(the_sheet_id, 'rbml!A:Z')
    the_sheet_avery = dataSheet(the_sheet_id, 'avery!A:Z')
    the_sheet_rbmlbooks = dataSheet(the_sheet_id, 'rbmlbooks!A:Z')

    # Location to save output
    if DEBUG is True:
        out_folder = "/cul/cul0/ldpd/archivesspace/test/accessions"
    else:
        out_folder = "/cul/cul0/ldpd/archivesspace/accessions"

    rbml_acc_file = os.path.join(out_folder, 'report_rbml_accessions.json')
    avery_acc_file = os.path.join(out_folder, 'report_avery_accessions.json')
    rbmlbooks_acc_file = os.path.join(out_folder,
                                      'report_rbmlbooks_accessions.json')

    print(' ')

    print('Starting accession report in ' +
          'https://docs.google.com/spreadsheets/d/' + str(the_sheet_id) +
          '/edit?usp=sharing')

    if not DEBUG:
        # Save the accessions as json files. In DEBUG mode, just use the files already saved.
        print('Saving Avery accession data to ' + avery_acc_file + '....')

        # Only fetch file if not in Debug mode
        with open(avery_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(3)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting Avery accession data!")

            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

        print('Saving RBML accession data to ' + rbml_acc_file + '....')

        with open(rbml_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(2)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting RBML accession data!")

            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

        print('Saving RBMLBOOKS accession data to ' + rbmlbooks_acc_file +
              '....')

        with open(rbmlbooks_acc_file, "w+") as f:
            try:
                x = asf.getAccessions(6)
                f.write(x)
            except:
                raise ValueError(
                    "There was an error in getting RBMLBOOKS accession data!")

            y = json.loads(x)
            if 'error' in y[0]:
                print(y[0]['error'])

    print(' ')

    # the_files = [
    #         [avery_acc_file, the_sheet_avery],
    #         [rbml_acc_file, the_sheet_rbml]
    #              ]

    the_recents = {}

    the_info = [{
        'repo_name': 'Avery',
        'repo_id': 3,
        'acc_file': avery_acc_file,
        'the_sheet': the_sheet_avery
    }, {
        'repo_name': 'RBML',
        'repo_id': 2,
        'acc_file': rbml_acc_file,
        'the_sheet': the_sheet_rbml
    }, {
        'repo_name': 'RBMLBOOKS',
        'repo_id': 6,
        'acc_file': rbmlbooks_acc_file,
        'the_sheet': the_sheet_rbmlbooks
    }]

    # The top-level elements to save from the JSON (each can be further processed below)
    the_keys = {
        "title": "title",
        "uri": "uri",
        "repository": "repository",
        "accession_date": "accession_date",
        "id_0": "id_0",
        "id_1": "id_1",
        "id_2": "id_2",
        "id_3": "id_3",
        "extents": "extents",
        "related_resources": "related_resources",
        "collection_management": "collection_management",
        "user_defined": "user_defined",
        "create_time": "create_time",
        "system_mtime": "system_mtime",
        "last_modified_by": "last_modified_by"
    }

    ext_dict = {
        "ext-number": "number",
        "ext-portion": "portion",
        "ext-type": "extent_type"
    }
    for f in the_info:

        the_file = f['acc_file']
        the_target = f['the_sheet']
        repo_name = f['repo_name']

        with open(the_file) as fp:
            the_data = json.load(fp)

        all_rows = []

        for an_accession in the_data:
            # acc_info : prelim dict for each accession. Do things to it.
            acc_info = {}
            for key, value in the_keys.items():
                try:
                    acc_info.update({key: an_accession[value]})
                except (IndexError, KeyError):
                    acc_info.update({key: ""})

            # Refine elements by extracting subelements, etc.

            # Handle collection_management
            cm = acc_info["collection_management"]
            cm_dict = {
                "processing_priority": "processing_priority",
                "processing_status": "processing_status"
            }
            for key, value in cm_dict.items():
                try:
                    acc_info[key] = cm[value]

                except (IndexError, KeyError, TypeError):
                    acc_info[key] = ''

            acc_info.pop("collection_management")

            # Parse resource id and get bibid
            res = acc_info["related_resources"]
            if len(res) > 0:
                res_url = res[0]["ref"]
                repo = res_url.split('/')[2]
                asid = res_url.split('/')[4]
                bibid = asf.lookupBibID(repo, asid, LOOKUP_CSV)
            else:
                bibid = ''
                asid = ''
            acc_info["resource_bibid"] = bibid
            acc_info["resource_asid"] = asid
            acc_info.pop("related_resources")

            # Parse BibID out of user_defined / integer_1
            try:
                usdef = acc_info["user_defined"]
                acc_info['integer_1'] = usdef['integer_1']
            except (KeyError, TypeError):
                acc_info['integer_1'] = ''
            acc_info.pop("user_defined")

            # Fix problem with leading "+" in id_3 (add apostrophe for display)
            acc_info["id_3"] = re.sub(r"^\+", "'+", acc_info["id_3"])

            # Handle repository
            repository = acc_info["repository"]
            if len(repository) > 0:
                repo_url = repository["ref"]
                repo = repo_url.split('/')[2]
            else:
                repo = ''
            acc_info["repo"] = repo
            acc_info.pop("repository")

            # Handle date
            acc_date = acc_info["accession_date"]
            yyyy = int(acc_date.split('-')[0])
            mm = int(acc_date.split('-')[1])
            dd = int(acc_date.split('-')[2])
            the_date = datetime.date(yyyy, mm, dd)
            # Due to a legacy import issue, records with unknown dates may have malformed values like 0002-01-23. Treat these as unknown.
            if the_date.year < 1700:
                acc_info["accession_date"] = "0000-00-00"
                acc_info["year"] = ""
            else:
                acc_info["year"] = the_date.year

            # Fiscal year
            if the_date.year < 1700:
                acc_info["fiscal-year"] = ""
            else:
                if the_date.month > 6:
                    acc_info["fiscal-year"] = the_date.year + 1
                else:
                    acc_info["fiscal-year"] = the_date.year

            # Handle extents
            ext = acc_info["extents"]
            for key, value in ext_dict.items():
                try:
                    acc_info[key] = ext[0][value]
                except (IndexError, KeyError):
                    acc_info[key] = ''

            acc_info.pop("extents")

            # Clean up titles
            acc_info['title'] = str(acc_info['title']).strip()

            # Uncomment to list records in log.
            # print("processing: " + str(acc_info["uri"]).strip() + ' / ' + str(acc_info["title"]).strip() )

            all_rows.append(acc_info)

        processed_msg = 'Processed ' + \
            str(len(all_rows)) + ' records in ' + repo_name + '.'
        print(processed_msg)

        log_it(SCRIPT_NAME, processed_msg)

        # the_heads = list(all_rows[0].keys())

        # Explicitly order the columns rather than relying on dict insertion order.
        the_heads = [
            'title', 'uri', 'accession_date', 'id_0', 'id_1', 'id_2', 'id_3',
            'integer_1', 'resource_bibid', 'resource_asid', 'repo', 'year',
            'fiscal-year', 'ext-number', 'ext-portion', 'ext-type',
            'processing_priority', 'processing_status', 'create_time',
            'system_mtime', 'last_modified_by'
        ]

        the_output = []

        # Build row in order specified by the_heads
        for a_row in all_rows:
            # r = list(a_row.values())
            r = [a_row[h] for h in the_heads]
            the_output.append(r)
            # print(a_row)

        # Sort by accession_date (index 2 in each row), newest first.
        the_output = sorted(the_output, key=itemgetter(2), reverse=True)

        # Get list of recents
        the_recents[repo_name] = []

        for i in the_output:
            # i[18] = the create date column
            i_date = dateutil.parser.isoparse(i[18]).date()

            if i_date > begin_of_week:

                the_recents[repo_name].append(i)

        # If there are recents, list them
        if the_recents[repo_name]:
            print(' ')
            recent_msg = str(len(the_recents[repo_name])) + \
                ' accessions recently added in ' + repo_name + ': '
            print(recent_msg)
            log_it(SCRIPT_NAME, recent_msg)
            print('-----------')
            for r in the_recents[repo_name]:
                print(r[0])
                print(r[1])
                print('Created ' + str(dateutil.parser.isoparse(r[18]).date()))
                print('Last edited by ' + r[20])
                print('-----------')
        else:
            print(' ')
            recent_msg = 'No recently created accessions in ' + repo_name
            print(recent_msg)
            log_it(SCRIPT_NAME, recent_msg)

            # print(the_recents[repo_name])

        the_output.insert(0, the_heads)

        print(' ')

        the_target.clear()

        print('Writing ' + repo_name + ' data to sheet ...')
        the_target.appendData(the_output)

        print(' ')

    # Generate a log entry and append it to the log tab, if one exists.
    the_tabs = the_target.initTabs

    now2 = datetime.datetime.now()
    end_time = str(now2)
    my_duration = str(now2 - now1)

    # Prefix the log entry with [TEST] when running in DEBUG mode.
    log_prefix = '[TEST] ' if DEBUG is True else ''
    the_log = log_prefix + 'Data imported from ' + target_server + ' by ' + \
        MY_NAME + '. Start: ' + start_time + '. Finished: ' + end_time + \
        ' (duration: ' + my_duration + ').'

    if 'log' in the_tabs:
        log_range = 'log!A:A'
        # today = datetime.datetime.today().strftime('%c')
        dataSheet(the_sheet_id, log_range).appendData([[the_log]])
    else:
        print('*** Warning: There is no log tab in this sheet. ***')

    print(' ')

    print(the_log)
    log_it(SCRIPT_NAME, the_log)

    print(' ')

    exit_msg = 'Script done. Updated data is available at ' + \
        'https://docs.google.com/spreadsheets/d/' + \
        str(the_sheet_id) + '/edit?usp=sharing'
    print(exit_msg)
    log_it(SCRIPT_NAME, exit_msg)
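
# Illustrative sketch (an assumption, not part of the original script): the
# accession-date handling above could be factored into a helper like this.
# It encodes only the rules used above: years before 1700 are treated as
# unknown, and the fiscal year rolls over in July (month > 6).
import datetime


def normalize_accession_date(acc_date_str):
    """Return (display_date, year, fiscal_year) for a YYYY-MM-DD string."""
    yyyy, mm, dd = (int(p) for p in acc_date_str.split('-'))
    the_date = datetime.date(yyyy, mm, dd)
    if the_date.year < 1700:
        # Legacy imports store unknown dates as values like 0002-01-23.
        return '0000-00-00', '', ''
    fiscal_year = the_date.year + 1 if the_date.month > 6 else the_date.year
    return acc_date_str, the_date.year, fiscal_year


# Example: normalize_accession_date('2019-08-27') returns ('2019-08-27', 2019, 2020).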
Example #28
0
def main():

    my_name = __file__

    # This makes sure the script can be run from any working directory and still find related files.
    my_path = os.path.dirname(__file__)

    sheet_id = '13OakaS0KHtxcaV9HGWDP9Zfnz9TVJR_9zGUnKrb90jk'  # test
    # sheet_id = '1tYOXSDFlkbX_revB_ULvhmCdvKkyzpipBTkYqYXcM38'
    # sheet_id = '1e43qKYvqGQFOMxA70U59yPKPs18y-k3ohRNdU-qrTH0'  # test
    # sheet_id = '1OhgJ4g-SWbmnms4b3ppe_0rBT7hz9jfQp6P8mADcatk'  # batch template doc

    container_sheet = dataSheet(sheet_id, 'containers!A:Z')

    marc_sheet = dataSheet(sheet_id, 'marc!A:Z')

    # Get a list of bibids from the Marc tab.
    # the_bibids = marc_sheet.getDataColumns()[0]
    the_bibids = marc_sheet.getDataColumns()[1]
    the_bibids.pop(0)
    the_bibids = list(set(the_bibids))
    print(the_bibids)

    #### TOP CONTAINERS ####

    the_heads = [
        'bibid', 'resource', 'uri', 'type', 'display_string', 'concat'
    ]
    the_rows = [the_heads]

    lookup_csv = os.path.join(my_path, 'id_lookup_prod.csv')
    for abib in the_bibids:
        print(abib)
        # Get repo and asid from bibid
        repo, asid = asf.lookupByBibID(abib, lookup_csv)

        print('Getting top containers for ' + str(repo) + ':' + str(asid))

        the_query = '/repositories/' + \
            str(repo) + '/resources/' + str(asid) + '/top_containers'

        # list of top containers
        the_refs = json.loads(asf.getResponse(the_query))
        print(the_refs)
        cnt = 0
        for r in the_refs:
            cnt += 1
            print(cnt)
            try:
                tc = json.loads(asf.getResponse(r['ref']))
                # print(tc)

                # Pull out the fields of interest, defaulting to '' when absent.
                collection = tc.get('collection') or [{}]
                bibid = collection[0].get('identifier', '')
                resource = collection[0].get('ref', '')
                uri = tc.get('uri', '')
                container_type = tc.get('type', '')
                display_string = tc.get('display_string', '')
                try:
                    concat_str = display_string + ' (' + uri.split('/')[4] + ')'
                except IndexError:
                    concat_str = 'x'

                a_row = [
                    bibid, resource, uri, container_type, display_string,
                    concat_str
                ]
                # print(a_row)
                the_rows.append(a_row)
            except Exception:
                print(r)

    # Write results to the Google Sheet.
    container_sheet.clear()
    z = container_sheet.appendData(the_rows)
    print(z)
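
# Illustrative sketch (an assumption): the top-container query above could be
# wrapped in a small helper. It assumes asf.getResponse takes an API path and
# returns a JSON string, exactly as it is used in the loop above.
def get_top_containers(repo, asid):
    """Return the list of top-container refs for one resource."""
    the_query = '/repositories/' + str(repo) + \
        '/resources/' + str(asid) + '/top_containers'
    return json.loads(asf.getResponse(the_query))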
Example #29
0
tree = et.parse(id_xml)
root = tree.getroot()

the_recs = root.findall('record')

the_ids = []

for a_rec in the_recs:
    i = a_rec.xpath('identifier/text()')

    asid = str(
        i[0].split('/')[-1]).rstrip()  # get the asid from the uri string.
    repo = str(
        i[0].split('/')[-3]).rstrip()  # get the repo from the uri string.
    bibid = a_rec.xpath('bibid/text()')[0]

    the_ids.append([repo, asid, bibid])

for x in the_ids:

    the_ead = asf.getEAD(x[0], x[1])

    out_path = output_folder + '/' + str(x[2]) + '_out.xml'

    # Save copy of existing object
    print('Saving data to ' + out_path + '....')

    with open(out_path, "w+") as f:
        f.write(the_ead)
my_name = __file__
script_name = os.path.basename(my_name)

# This makes sure the script can be run from any working directory and still find related files.
my_path = os.path.dirname(__file__)

sheet_id = "1dTeMAK_cGWAUvrqvAiY2hGy4gJewrmWjnuIZu8NhWwE"
# sheet_id = "18uvn9wIABHVIdjlSRNXqnHUKB2aTvZgKO62e-UFNuO8"  # test

now1 = datetime.datetime.now()
start_time = str(now1)
end_time = ""  # set later

# First get the agent records from API (this can take a long time!)

asf.setServer("Prod")  # AS instance: Prod | Dev | Test

the_info = [
    {
        "name": "families",
        "endpoint": "/agents/families",
    },
    {
        "name": "corporate",
        "endpoint": "/agents/corporate_entities",
    },
    {
        "name": "persons",
        "endpoint": "/agents/people",
    },
]
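
# Illustrative sketch (an assumption about what a retrieval loop could look
# like, not necessarily how this script continues): ArchivesSpace list
# endpoints accept an all_ids=true parameter that returns the full list of
# record ids, which can then be fetched individually with asf.getResponse as
# in the examples above. Assumes json and asf are imported as above.
for i in the_info:
    endpoint = i["endpoint"]
    the_ids = json.loads(asf.getResponse(endpoint + "?all_ids=true"))
    print("Found " + str(len(the_ids)) + " " + i["name"] + " agent records.")
    for an_id in the_ids[:5]:  # sample the first few records
        rec = json.loads(asf.getResponse(endpoint + "/" + str(an_id)))
        print(rec.get("title", ""))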