def add_citation(collection, token, production=True):
    """Add example citation text in the description field"""
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        description = record["descriptions"]
        cite_exists = False
        for d in description:
            descr_text = d["description"]
            if descr_text.startswith("<br>Cite this record as:"):
                cite_exists = True
        if cite_exists == False:
            record_doi = record["identifier"]["identifier"]
            headers = {"Accept": "text/x-bibliography; style=apa"}
            citation_link = "https://doi.org/"
            citation = requests.get(citation_link + record_doi, headers=headers).text
            doi_url = "https://doi.org/" + record_doi
            if doi_url in citation:
                # Check that we have a citation and not a server error,
                # otherwise wait till next time
                n_txt = citation_text(citation, doi_url, record_doi)
                description.append({"descriptionType": "Other", "description": n_txt})
                response = caltechdata_edit(
                    token, k, {"descriptions": description}, {}, {}, production
                )
                print(response)
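# A minimal sketch of the citation_text() helper that add_citation() assumes; the
# real helper is defined elsewhere in this codebase, so the exact formatting here is
# an assumption. It prefixes the DOI citation with the marker string that
# add_citation() later checks for ("<br>Cite this record as:") and makes the DOI a
# clickable link.
def citation_text(citation, doi_url, record_doi):
    # Replace the bare DOI URL with an HTML link so the citation is clickable
    linked = citation.replace(doi_url, f'<a href="{doi_url}">{doi_url}</a>')
    return "<br>Cite this record as:<br>" + linked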
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage information"""
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage.
            # All dates are the date added to CaltechDATA, which is
            # the appropriate 'publication' date even if content was available
            # earlier
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
def test_join(t, collection_name):
    key = "test_join1"
    obj1 = {"one": 1}
    obj2 = {"two": 2}
    if dataset.status(collection_name) == False:
        t.error("Failed, collection status is False,", collection_name)
        return
    ok = dataset.has_key(collection_name, key)
    err = ''
    if ok == True:
        ok = dataset.update(collection_name, key, obj1)
    else:
        ok = dataset.create(collection_name, key, obj1)
    if ok == False:
        err = dataset.error_message()
        t.error(
            f'Failed, could not add record for test ({collection_name}, {key}, {obj1}), {err}')
        return
    if dataset.join(collection_name, key, obj2, overwrite=False) == False:
        err = dataset.error_message()
        t.error(
            f'Failed, join for {collection_name}, {key}, {obj2}, overwrite = False -> {err}')
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    if obj_result.get('one') != 1:
        t.error(f'Failed to join append key {key}, {obj_result}')
    if obj_result.get("two") != 2:
        t.error(f'Failed to join append key {key}, {obj_result}')
    obj2['one'] = 3
    obj2['two'] = 3
    obj2['three'] = 3
    if dataset.join(collection_name, key, obj2, overwrite=True) == False:
        err = dataset.error_message()
        t.error(
            f'Failed to join overwrite {collection_name}, {key}, {obj2}, overwrite = True -> {err}')
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    for k in obj_result:
        if k != '_Key' and obj_result[k] != 3:
            t.error('Failed to update value in join overwrite', k, obj_result)
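# The test functions here receive a test-helper object `t` with error() and print()
# methods. The real harness ships with the py_dataset test suite; this stand-in is
# only a sketch, assuming error() should record a failure and keep going.
class TestHelper:
    def __init__(self, name):
        self.name = name
        self.failures = 0

    def error(self, *args):
        # Record a failure and report it without stopping the remaining checks
        self.failures += 1
        print(f"FAIL {self.name}:", *args)

    def print(self, *args):
        print(f"{self.name}:", *args)

# Assumed invocation, with a placeholder collection name:
# t = TestHelper("test_join")
# test_join(t, "test_collection.ds")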
def submit_report(month_collection, keys, token, production, prefix=None, org="Caltech_Library"):
    for k in keys:
        datasets, err = dataset.read(month_collection, k, clean_object=True)
        if err != "":
            print(err)
        datasets = datasets["report-datasets"]
        dates = datasets[0]["performance"][0]["period"]
        if prefix != None:
            filtered = []
            for d in datasets:
                rec_prefix = d["dataset-id"][0]["value"].split("/")[0]
                if rec_prefix in prefix:
                    filtered.append(d)
            datasets = filtered
        # Build report structure
        today = date.today().isoformat()
        report = {
            "report-header": {
                "report-name": "dataset report",
                "report-id": "DSR",
                "release": "rd1",
                "report-filters": [],
                "report-attributes": [],
                "exceptions": [],
                "created-by": org,
                "created": today,
                "reporting-period": {
                    "begin-date": dates["begin-date"],
                    "end-date": dates["end-date"],
                },
            },
            "report-datasets": datasets,
        }
        if production:
            url = "https://api.datacite.org/reports/"
        else:
            url = "https://api.test.datacite.org/reports/"
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": "Bearer %s" % token,
        }
        r = requests.post(url, headers=headers, json=report)
        if r.status_code != 201:
            print(r.text)
            print(report)
        else:
            print(r.json()["report"]["id"])
def make_link_history(collection, resolver, url, note):
    """Make an entry in our link history collection"""
    now = datetime.today().isoformat()
    # Run checks on both resolver and final URL
    try:
        target = requests.get(url)
    except requests.exceptions.ConnectionError:
        target = requests.Response()
        target.status_code = 404
        target.url = ''
    if target.status_code != 200:
        print(f"Target URL {url} returns Error status code {target.status_code}")
    if links_differ(target.url, url):
        print(f"Target URL '{url}' redirects to '{target.url}'")
    try:
        get = requests.get(f"https://resolver.library.caltech.edu/{resolver}")
    except requests.exceptions.ConnectionError:
        get = requests.Response()
        get.status_code = 404
        get.url = ''
    if links_differ(get.url, url):
        print(f"Mismatch between expected url '{url}' and actual '{get.url}'")
    if get.status_code != 200:
        print(
            f"Resolver URL ({resolver}) '{get.url}' returns Error status code {get.status_code}"
        )
    entry = {
        "expected-url": url,
        "url": get.url,
        "modified": now,
        "code": get.status_code,
        "note": note,
    }
    # If existing, push into history
    if dataset.has_key(collection, resolver):
        existing, err = dataset.read(collection, resolver)
        if err != "":
            print(err)
            exit()
        if save_history(existing, url, get):
            past_history = existing.pop("history")
            past_history.append(existing)
            entry["history"] = past_history
            if not dataset.update(collection, resolver, entry):
                print(dataset.error_message())
                exit()
    else:
        entry["history"] = []
        if not dataset.create(collection, resolver, entry):
            print(dataset.error_message())
            exit()
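# make_link_history() relies on two small helpers defined elsewhere; these sketches
# only illustrate the assumed behavior. links_differ() is taken to ignore trivial
# differences such as a trailing slash or http vs https, and save_history() to
# report whether the stored entry no longer matches what the resolver currently
# returns.
def links_differ(actual, expected):
    def norm(u):
        return u.replace("http://", "https://").rstrip("/")
    return norm(actual) != norm(expected)


def save_history(existing, url, get):
    # Push the old entry into history if the URL, status code, or expected URL changed
    return (
        existing.get("url") != get.url
        or existing.get("code") != get.status_code
        or existing.get("expected-url") != url
    )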
def update_datacite_metadata(collection, token, access):
    """Access contains username, password, and prefix for DataCite"""
    keys = dataset.keys(collection)
    for a in access:
        username = a["username"]
        password = a["password"]
        prefix = a["prefix"]
        # Initialize the MDS client.
        d = DataCiteMDSClient(
            username=username,
            password=password,
            prefix=prefix,
            url="https://mds.datacite.org",
        )
        for k in keys:
            print(k)
            metadata, err = dataset.read(collection, k)
            if err != "":
                print(err)
                exit()
            # Get rid of Key from dataset
            metadata.pop("_Key")
            if "identifier" in metadata:
                record_doi = metadata["identifier"]["identifier"]
                # Handle records with 4.3 metadata elements
                if "schemaVersion" in metadata:
                    metadata.pop("schemaVersion")
                if "types" in metadata:
                    metadata.pop("types")
                if record_doi.split("/")[0] == prefix:
                    result = schema40.validate(metadata)
                    # Debugging if this fails
                    if result == False:
                        print(metadata)
                        errors = sorted(
                            schema40.validator.iter_errors(metadata), key=lambda e: e.path
                        )
                        for error in errors:
                            print(error.message)
                        exit()
                    xml = schema40.tostring(metadata)
                    response = d.metadata_post(xml)
                    print(response)
def get_multiple_links(input_collection, output_collection):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                count = idvs.count(idv["relatedIdentifier"])
                if count > 1:
                    print("DUPE")
                    print(k)
                    print(idv["relatedIdentifier"])
def match_codemeta():
    collection = "github_records.ds"
    keys = dataset.keys(collection)
    for k in keys:
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        if "completed" not in existing:
            print("Processing new record ", k)
            if dataset.attachments(collection, k) != "":
                dataset.detach(collection, k)
                # Update CaltechDATA
                token = os.environ["TINDTOK"]
                infile = open("codemeta.json", "r")
                try:
                    meta = json.load(infile)
                except:
                    print("Invalid json file - Skipping forever ", k)
                else:
                    standardized = codemeta_to_datacite(meta)
                    # Check that all records have a GitHub subject tag
                    add = True
                    for s in standardized["subjects"]:
                        if s["subject"] == "Github":
                            add = False
                        if s["subject"] == "GitHub":
                            add = False
                    if add == True:
                        standardized["subjects"].append({"subject": "GitHub"})
                    response = caltechdata_edit(token, k, standardized, {}, {}, True)
                    print(response)
                os.system("rm codemeta.json")
            existing["completed"] = "True"
            if not dataset.update(collection, k, existing):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
def get_history(collection, caltechdata_collection, caltechdata_keys):
    """Harvest the history of records from CaltechDATA."""
    keys_to_update = []
    if os.path.exists("historyupdate"):
        with open("historyupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(caltechdata_keys, redirect_stdout=True):
        existing, err = dataset.read(caltechdata_collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        record_update = datetime.fromisoformat(existing["updated"]).date()
        if record_update > update:
            keys_to_update.append(k)
    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    base_url = "https://data.caltech.edu/records/"
    for k in progressbar(keys_to_update):
        url = base_url + str(k) + "/revisions"
        response = requests.get(url)
        revisions = response.json()
        for num, metadata in enumerate(revisions):
            key = f"{k}-{num}"
            if dataset.has_key(collection, key) == False:
                dataset.create(collection, key, metadata)
    # Save date in file
    today = date.today().isoformat()
    with open("historyupdate", "w") as outfile:
        outfile.write(today)
def add_files(collection):
    # Run through all elements in collection
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != '':
            print(err)
            exit()
        url = record['url_links']
        print('Processing file from ', url)
        # Make a dummy file to represent results from kallisto
        files = ['example_file' + k]
        for f in files:
            with open(f, "w") as file:
                file.write(" 0 1 0 " + k)
        # Now attach file to collection
        err = dataset.attach(collection, k, files)
        if err != '':
            print(err)
            exit()
        # Cleanup local disk
        for f in files:
            os.remove(f)
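# A hedged follow-up sketch: listing and retrieving the attachment created by
# add_files(). dataset.attachments() and dataset.detach() are used elsewhere in this
# codebase; the collection and key names here are placeholders.
# for k in dataset.keys(collection):
#     print(dataset.attachments(collection, k))   # e.g. ['example_file<k>']
#     dataset.detach(collection, k)               # copies the attached file(s) back to disk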
def migrate_attachment(c_name, key):
    obj, err = dataset.read(c_name, key)
    obj_path = dataset.path(c_name, key).replace(key + ".json", "")
    tarball = os.path.join(obj_path, key + ".tar")
    if os.path.exists(tarball):
        tar = tarfile.open(tarball)
        tar.extractall()
        tar.close()
        files = os.listdir()
        # Prune _Attachments from object and resave
        if "_Attachments" in obj:
            del obj["_Attachments"]
            err = dataset.update(c_name, key, obj)
            if err != "":
                print(f"Can't remove _Attachments metadata, {err}")
                sys.exit(1)
        for fname in files:
            print(".", end="")
            reattach(c_name, key, "v0.0.0", fname)
            os.remove(fname)
        # NOTE: if all re-attached then we need to remove tarball too
        os.remove(tarball)
        sys.stdout.flush()
def get_crossref_refs(prefix, done=False, new=True):
    # New=True will download everything from scratch and delete any existing records
    collection = "crossref_refs.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix
    )
    collected = dataset.has_key(collection, "captured")
    cursor = ""
    count = 0
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettiness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)
        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same cursor back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None
    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]
        # Check Deleted
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on delete: {err}")
            cursor = records["message"]["next-cursor"]
        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]
    if done:
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
def send_simple_message(token, matched):
    matched_key = matched[0]
    matched_dois = matched[1]
    # Use raw api call to get email
    api_url = "https://data.caltech.edu/api/record/"
    r = requests.get(api_url + matched_key)
    r_data = r.json()
    if "message" in r_data:
        raise AssertionError(
            "id " + matched_key + " expected http status 200, got " + str(r_data["message"])
        )
    if not "metadata" in r_data:
        raise AssertionError("expected a metadata property in response, got " + str(r_data))
    metadata = r_data["metadata"]
    email = ""
    name = ""
    if "contributors" in metadata:
        for c in metadata["contributors"]:
            if c["contributorType"] == "ContactPerson":
                if "contributorEmail" in c:
                    email = c["contributorEmail"]
                    name = c["contributorName"]
    if email == "":
        print("Missing email for record ", matched_key)
    else:
        # Use dataset version to get datacite metadata
        metadata, err = dataset.read("caltechdata.ds", matched_key)
        if err != "":
            print(f"Unexpected error on read: {err}")
            exit()
        title = metadata["titles"][0]["title"]
        doi = metadata["identifier"]["identifier"]
        headers = {"Accept": "text/bibliography;style=apa"}
        citation_block = ""
        for matched in matched_dois:
            citation = requests.get(matched, headers=headers)
            citation.encoding = "utf-8"
            citation = citation.text
            citation = su.unescape(citation)
            citation_block = citation_block + "<p>" + citation + "</p>"
        # Send email
        return requests.post(
            "https://api.mailgun.net/v3/notices.caltechlibrary.org/messages",
            auth=("api", token),
            files=[("inline", open("CaltechDATA_Logo_cropped.png", "rb"))],
            data={
                "from": "CaltechDATA Notices <*****@*****.**>",
                "to": name + " <" + email + ">, Tom Morrell <*****@*****.**>",
                "subject": "Your CaltechDATA Work has been cited!",
                "html": (
                    '<html> <center> <img src="cid:CaltechDATA_Logo_cropped.png" '
                    + 'alt="CaltechDATA Logo" width="249" height="69"> </center> '
                    + "<p> Dear " + name + ", </p>"
                    + '<p>Your CaltechDATA work "' + title + '" has been cited in:</p>'
                    + citation_block
                    + "<p>The citation(s) are now listed in your CaltechDATA record at "
                    + '<a href="https://doi.org/' + doi + '">' + doi + "</a>.</p>"
                    + "<p> Best, </p><p>CaltechDATA Alerting Service</p><hr>"
                    + "<p> Is this incorrect? Let us know at "
                    + '<a href="mailto:[email protected]?Subject=Issue%20with%20citation%20link%20between%20'
                    + doi + "%20and%20" + ",".join(matched_dois)
                    + '">[email protected]</a></p>'
                    + "<p> This email was sent by the Caltech Library, "
                    + "1200 East California Blvd., MC 1-43, Pasadena, CA 91125, USA </p> </html>"
                ),
            },
        )
import sys
from datetime import datetime
from py_dataset import dataset

#
# Loop through the keys, fetch the record and append a _State of "deposit" to
# each object.
#
c_name = "people.ds"
keys = dataset.keys(c_name)
#print(f"DEBUG Keys: {keys}")
for key in keys:
    print(f"Fixing key {key}")
    data, err = dataset.read(c_name, key)
    if err != "":
        print(f"Error read {c_name} -> {key}, {err}")
        sys.exit(1)
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    obj = {
        "_Key": key,
        "_State": "deposit",
        "_Updated": f"{dt}",
        "_Created": f"{dt}"
    }
    # Make fieldname lower case
    for field in data:
        fkey = field.lower()
        if ' ' not in fkey:
            obj[fkey] = data[field]
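    # The snippet above stops after building `obj`; a write-back step along these
    # lines is assumed to follow, using dataset.update() since the key already
    # exists in people.ds.
    if not dataset.update(c_name, key, obj):
        print(f"Error updating {c_name} -> {key}, {dataset.error_message()}")
        sys.exit(1)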
import_coll = "imported.ds" os.system("rm -rf imported.ds") dataset.init(import_coll) os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json" err = dataset.import_gsheet(import_coll, sheet, 'Sheet1', 1, 'A:CZ') if err != '': print(err) keys = dataset.keys(import_coll) coauthors = [] count = 0 for key in progressbar(keys, redirect_stdout=True): record, err = dataset.read(name, key) if err != "": print(err) count = 0 if 'identifiers' in record: identifiers = record['identifiers'] else: identifiers = [] print(key) print(record) affiliations = record['affiliations'] authors = record['authors'].split(';') link = record['link'] year = record['year'] for a in authors: #If none of the words in remove_words appears, we have an author
archive_path = 'https://wayback.archive-it.org/9060/'
err = dataset.import_gsheet(collection, sheet_id, sheet_name, 1, cell_range, overwrite=True)
if err != '':
    print(f"Unexpected error on importing gsheet to {collection}, {err}")
    exit()
keys = dataset.keys(collection)
for key in keys:
    inputv, err = dataset.read(collection, key)
    if err != "":
        print(f"Unexpected error for {key} in {collection}, {err}")
        exit()
    # If we haven't assigned a doi for this resource before
    if 'doi' not in inputv:
        # Confirm that archiving is successful
        if 'archive_complete' in inputv:
            if inputv['archive_complete'] == 'Yes':
                metadata = {}
                metadata['titles'] = [{'title': inputv['title']}]
                authors = []
                alist = inputv['author'].split(';')
                if 'affiliation' in inputv:
                    aff_list = inputv['affiliation'].split(';')
                else:
def update_datacite_media(username, password, collection, prefix):
    keys = dataset.keys(collection)
    if path.exists("mediaupdate"):
        with open("mediaupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(keys, redirect_stdout=True):
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        atlas = False
        subjects = existing["subjects"]
        for subject in subjects:
            if subject["subject"].strip() == "Atlas of Bacterial and Archaeal Cell Structure":
                atlas = True
        record_update = datetime.fromisoformat(existing["updated"]).date()
        # Subtraction to get window to grab records that were updated between runs
        if record_update > update - timedelta(days=2):
            if "electronic_location_and_access" in existing:
                doi = existing["identifier"]["identifier"]
                record_prefix = doi.split("/")[0]
                if record_prefix == prefix:
                    delete_datacite_media(username, password, doi)
                    for file_met in existing["electronic_location_and_access"]:
                        url = "https://mds.datacite.org/media/" + doi
                        headers = {"Content-Type": "application/txt;charset=UTF-8"}
                        extension = file_met["electronic_name"][0].split(".")[-1]
                        filename = file_met["electronic_name"][0].split(".")[0]
                        data = {}
                        if extension == "nc":
                            data = (
                                "application/x-netcdf="
                                + file_met["uniform_resource_identifier"]
                            )
                        elif extension == "mp4":
                            if atlas:
                                data = (
                                    "video/mp4="
                                    + "https://www.cellstructureatlas.org/videos/"
                                    + filename
                                    + ".mp4"
                                )
                            else:
                                data = (
                                    "video/mp4=" + file_met["uniform_resource_identifier"]
                                )
                        elif extension == "mj2":
                            data = "video/mj2=" + file_met["uniform_resource_identifier"]
                        elif extension == "avi":
                            data = "video/avi=" + file_met["uniform_resource_identifier"]
                        elif extension == "mov":
                            data = (
                                "video/quicktime=" + file_met["uniform_resource_identifier"]
                            )
                        elif extension == "gz":
                            data = (
                                "application/gzip=" + file_met["uniform_resource_identifier"]
                            )
                        elif extension == "zip":
                            data = (
                                "application/zip=" + file_met["uniform_resource_identifier"]
                            )
                        elif extension == "h5ad":
                            data = (
                                "application/octet-stream="
                                + file_met["uniform_resource_identifier"]
                            )
                        if data != {}:
                            print(doi)
                            print(data)
                            r = requests.post(
                                url,
                                data=data.encode("utf-8"),
                                auth=(username, password),
                                headers=headers,
                            )
                            print(r)
def add_thesis_doi(data_collection, thesis_collection, token, production=True):
    """Add thesis DOIs to CaltechDATA records"""
    # Search across CaltechTHESIS DOIs
    dot_paths = ["._Key", ".doi", ".official_url", ".related_url"]
    labels = ["eprint_id", "doi", "official_url", "related_url"]
    keys = dataset.keys(thesis_collection)
    all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels)
    dois = []
    for metadata in progressbar(all_metadata, redirect_stdout=True):
        if "doi" in metadata:
            record_doi = metadata["doi"].strip()
            if "related_url" in metadata and "items" in metadata["related_url"]:
                items = metadata["related_url"]["items"]
                for item in items:
                    if "url" in item:
                        url = item["url"].strip()
                    if "type" in item:
                        itype = item["type"].strip().lower()
                    if itype == "doi":
                        if idutils.is_doi(url):
                            doi = "10." + url.split("10.")[1]
                            prefix = doi.split("/")[0]
                            if prefix == "10.22002":
                                dois.append([doi, record_doi])
                        else:
                            print("Ignoring non-DOI")
                            print(metadata["eprint_id"])
                            print(url.split("10."))
    for doi_link in dois:
        cd_doi = doi_link[0]
        thesis_doi = doi_link[1]
        print("Checking " + cd_doi)
        if "D1" in cd_doi:
            record_number = cd_doi.split("D1.")[1]
        if "d1" in cd_doi:
            record_number = cd_doi.split("d1.")[1]
        record, err = dataset.read(data_collection, record_number)
        if err != "":
            print(err)
            exit()
        done = False
        if "relatedIdentifiers" in record:
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == thesis_doi:
                    done = True
            if done == False:
                identifiers = record["relatedIdentifiers"]
                identifiers.append(
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                )
                new_metadata = {"relatedIdentifiers": identifiers}
        else:
            new_metadata = {
                "relatedIdentifiers": [
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                ]
            }
        if done == False:
            print("Adding " + thesis_doi + " to " + cd_doi)
            response = caltechdata_edit(
                token, record_number, new_metadata, {}, {}, True
            )
            print(response)
def add_usage(collection, token, usage_collection, production=True):
    """Add usage text in the description field"""
    keys = dataset.keys(collection)
    biggest_views = 0
    biggest_views_record = ""
    biggest_downloads = 0
    biggest_downloads_record = ""
    total_views = 0
    total_downloads = 0
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        usage, err = dataset.read(usage_collection, k)
        views = usage["grand-total-unique-investigations"]
        downloads = usage["grand-total-unique-requests"]
        if views > biggest_views:
            biggest_views = views
            biggest_views_record = k
        if downloads > biggest_downloads:
            biggest_downloads = downloads
            biggest_downloads_record = k
        total_views += views
        total_downloads += downloads
        date = datetime.fromisoformat(usage["dataset-dates"][0]["value"])
        now = datetime.today()
        first = date.strftime("%B %d, %Y")
        last = now.strftime("%B %d, %Y")
        if views > 1:
            u_txt = (
                "<br>Unique Views: "
                + str(views)
                + "<br>Unique Downloads: "
                + str(downloads)
                + "<br> between "
                + first
                + " and "
                + last
                + '<br><a href="https://data.caltech.edu/stats"'
                + ">More info on how stats are collected</a><br>"
            )
            description = record["descriptions"]
            use_exists = False
            for d in description:
                descr_text = d["description"]
                # We always update an existing listing
                if descr_text.startswith("<br>Unique Views:"):
                    d["description"] = u_txt
                    use_exists = True
            # Otherwise we add a new one
            if use_exists == False:
                description.append({"descriptionType": "Other", "description": u_txt})
            response = caltechdata_edit(
                token, k, {"descriptions": description}, {}, {}, production
            )
            print(response)
    print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}")
    print(f"Most views {biggest_views} for record {biggest_views_record}")
    print(f"Total downloads {total_downloads}")
    print(f"Total views {total_views}")
def get_usage(usage_collection, mapping, token):
    """Collect usage into a usage object for items in CaltechDATA"""
    # Find time periods
    datev, err = dataset.read(usage_collection, "end-date")
    new_start = datetime.fromtimestamp(datev["end-date"])
    now = datetime.now().timestamp()
    # minutes in range
    minutes_diff = math.ceil(
        (datetime.fromtimestamp(now) - new_start).total_seconds() / 60.0
    )
    # Get number of visitors since last harvest
    stats_url_base = "https://stats.tind.io/index.php?module=API&method=Live.getCounters&idSite=1161&format=JSON"
    token_s = "&token_auth=" + token
    stats_url = f"{stats_url_base}{token_s}&lastMinutes={minutes_diff}"
    response = requests.get(stats_url)
    if response.status_code != 200:
        print(response.text)
        print(stats_url)
    visitors = response.json()[0]["visits"]
    print(visitors)
    visit_url_base = "https://stats.tind.io/index.php?module=API&method=Live.getLastVisitsDetails&idSite=1161&format=json&filter_limit=1000"
    print("Getting usage")
    usage = []
    # We will page through visitors in chunks of 1000
    chunks = math.ceil(int(visitors) / 1000)
    if chunks > 1:
        url = visit_url_base + token_s + "&filter_limit=1000"
        process_visits(url, mapping)
        for c in progressbar(range(chunks)):
            url = f"{visit_url_base}{token_s}&filter_limit=1000&filter_offset={c*1000}"
            usage += process_visits(url, mapping)
    else:
        url = f"{visit_url_base}{token_s}&filter_limit={visitors}"
        usage = process_visits(url, mapping)
    print("Writing usage")
    for use in progressbar(usage):
        date = use["date"]
        if "downloads" in use and "views" in use:
            records = use["views"].union(use["downloads"])
        elif "views" in use:
            records = use["views"]
        else:
            records = use["downloads"]
        for rec in records:
            data, err = dataset.read(usage_collection, rec)
            if err == "":
                # We only track usage from live records
                instance = {"instance": [], "period": date}
                if "views" in use:
                    if rec in use["views"]:
                        instance["instance"].append(
                            {
                                "access-method": "regular",
                                "count": 1,
                                "metric-type": "unique-dataset-investigations",
                            }
                        )
                        # print(data, rec)
                        data["grand-total-unique-investigations"] += 1
                if "downloads" in use:
                    if rec in use["downloads"]:
                        instance["instance"].append(
                            {
                                "access-method": "regular",
                                "count": 1,
                                "metric-type": "unique-dataset-requests",
                            }
                        )
                        data["grand-total-unique-requests"] += 1
                data["performance"].append(instance)
                dataset.update(usage_collection, rec, data)
    dataset.update(usage_collection, "end-date", {"end-date": now})
def fix_multiple_links(input_collection, token):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            new = []
            dupes = []
            replace = False
            record_doi = record["identifier"]["identifier"]
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == record_doi:
                    # Having a related identifier that is the same as the record
                    # doi doesn't make any sense
                    replace = True
                    dupes.append(identifier)
                else:
                    count = idvs.count(identifier)
                    if count > 1:
                        replace = True
                        if identifier not in dupes:
                            # We need to save the first duplicate
                            new.append(idv)
                            # Add to list of those already saved
                            dupes.append(identifier)
                        else:
                            # This will be deleted
                            dupes.append(identifier)
                    else:
                        # Save all unique ids
                        new.append(idv)
            if replace == True:
                print("Duplicate links found in record ", k)
                print("Will delete these links", dupes)
                response = input("Do you approve this change? Y or N")
                new_metadata = {"relatedIdentifiers": new}
                if response == "Y":
                    response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                    print(response)
        if "alternateIdentifiers" in record:
            idtypes = []
            alt_ids = []
            repeat = False
            for idv in record["alternateIdentifiers"]:
                if idv["alternateIdentifierType"] not in idtypes:
                    # If we haven't seen id type before, save it
                    alt_ids.append(idv)
                    idtypes.append(idv["alternateIdentifierType"])
                else:
                    repeat = True
                    print("Will Delete Repeated ID ", idv["alternateIdentifier"])
            if repeat == True:
                new_metadata = {"alternateIdentifiers": alt_ids}
                response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                print(response)
def match_cd_refs():
    token = os.environ["TINDTOK"]
    matches = []
    collection = "caltechdata.ds"
    keys = dataset.keys(collection)
    if "mediaupdate" in keys:
        keys.remove("mediaupdate")
    # Get event data results
    event_data = "crossref_refs.ds"
    event_keys = dataset.keys(event_data)
    event_keys.remove("captured")
    f_name = "match_cd_refs"
    dot_paths = [".obj_id", ".id", ".subj_id"]
    labels = ["obj_id", "id", "subj_id"]
    print("Getting Event Data Records")
    if dataset.has_frame(event_data, f_name):
        if not dataset.frame_reframe(event_data, f_name, event_keys):
            err = dataset.error_message()
            print(f"Failed to reframe {f_name} in {event_data}, {err}")
            exit()
    elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels):
        err = dataset.error_message()
        print(f"Failed to create frame {f_name} in {event_data}, {err}")
        exit()
    grid = dataset.frame_grid(event_data, f_name)
    df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"])
    grouped = df.groupby(["obj_id"])
    groups = grouped.groups
    # Look at all CaltechDATA records
    for k in keys:
        # Collect matched new links for the record
        record_matches = []
        print(k)
        metadata, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        doi = "https://doi.org/" + metadata["identifier"]["identifier"]
        if doi in groups:
            hits = grouped.get_group(doi)
            for index, h in hits.iterrows():
                # Trigger for whether we already have this link
                new = True
                if "relatedIdentifiers" in metadata:
                    for m in metadata["relatedIdentifiers"]:
                        if m["relatedIdentifier"] in h["subj_id"]:
                            new = False
                if new == True:
                    match = h["subj_id"]
                    print(match)
                    print(h["obj_id"])
                    inputv = input("Do you approve this link? Type Y or N: ")
                    if inputv == "Y":
                        record_matches.append(match)
        # If we have to update record
        if len(record_matches) > 0:
            ids = []
            if "relatedIdentifiers" in metadata:
                for m in metadata["relatedIdentifiers"]:
                    ids.append(m)
            matches.append([k, record_matches])
            # Now collect identifiers for record
            for match in record_matches:
                split = match.split("doi.org/")
                new_id = {
                    "relatedIdentifier": split[1],
                    "relatedIdentifierType": "DOI",
                    "relationType": "IsCitedBy",
                }
                ids.append(new_id)
            newmetadata = {"relatedIdentifiers": ids}
            response = caltechdata_edit(token, k, newmetadata, {}, {}, True)
            print(response)
    return matches
from ames.harvesters import get_caltechfeed, get_records

if __name__ == "__main__":
    import_coll = "imported.ds"
    sheet = "1ZI3-XvQ_3rLcKrF-4FBa2tEInIdQfOnGJ9L_NmhmoGs"
    os.system("rm -rf imported.ds")
    dataset.init(import_coll)
    err = dataset.import_gsheet(import_coll, sheet, "CaltechPEOPLE", 4, "A:AA")
    if err != "":
        print(err)
    people_list = dataset.keys(import_coll)
    people = []
    for p in people_list:
        record, err = dataset.read(import_coll, p)
        people.append(record)
    # Profiles collection from feeds
    profile_ds = "profiles.ds"
    keys = dataset.keys(profile_ds)
    labels = ["orcid", "creator_id"]
    dot_paths = [".orcid", ".creator_id"]
    all_metadata = get_records(dot_paths, "profile", profile_ds, keys, labels)
    for profile in all_metadata:
        if "creator_id" in profile:
            idv = profile["creator_id"]
        else:
            print("ERROR", profile)
        for person in people:
def test_basic(t, collection_name):
    '''test_basic(collection_name) runs tests on basic CRUD ops'''
    # Setup a test record
    key = "2488"
    value = {
        "title": "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
        "formats": ["epub", "kindle", "plain text"],
        "authors": [{"given": "Jules", "family": "Verne"}],
        "url": "https://www.gutenberg.org/ebooks/2488"
    }
    # We should have an empty collection, we will create our test record.
    if dataset.create(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f'create({collection_name}, {key}, {value}) failed, {err}')
        return
    # Check to see that we have only one record
    key_count = dataset.count(collection_name)
    if key_count != 1:
        t.error(f"Failed, expected count to be 1, got {key_count}")
    # Do a minimal test to see if the record looks like it has content
    keyList = dataset.keys(collection_name)
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"expected {rec[k]} got {v}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with list v, got {v}")
    # Test updating record
    value["verified"] = True
    if dataset.update(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f"update({collection_name}, {key}, {value}) failed, {err}")
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"expected {rec[k]} got {v} for key {k}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with a list for v, got {v}")
    # Test path to record
    expected_s = "/".join(
        [collection_name, "pairtree", "24", "88", (key + ".json")])
    expected_l = len(expected_s)
    p = dataset.path(collection_name, key)
    if len(p) != expected_l:
        t.error("Failed, expected length", expected_l, "got", len(p))
    if p != expected_s:
        t.error("Failed, expected", expected_s, "got", p)
    # Test listing records
    l = dataset.list(collection_name, [key])
    if len(l) != 1:
        t.error(
            f"list({collection_name}, [{key}]) failed, list should return an array of one record, got", l)
        return
    # test deleting a record
    if dataset.delete(collection_name, key) == False:
        err = dataset.error_message()
        t.error("Failed, could not delete record", key, ", ", err)
def test_keys(t, collection_name):
    '''test_keys(collection_name) test getting, filter and sorting keys'''
    # Test count after delete
    key_list = dataset.keys(collection_name)
    cnt = dataset.count(collection_name)
    if cnt != 0:
        t.error("Failed, expected zero records, got", cnt, key_list)
    #
    # Generate multiple records for collection for testing keys
    #
    test_records = {
        "gutenberg:21489": {
            "title": "The Secret of the Island",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{"given": "Jules", "family": "Verne"}],
            "url": "http://www.gutenberg.org/ebooks/21489",
            "categories": "fiction, novel"
        },
        "gutenberg:2488": {
            "title": "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{"given": "Jules", "family": "Verne"}],
            "url": "https://www.gutenberg.org/ebooks/2488",
            "categories": "fiction, novel"
        },
        "gutenberg:21839": {
            "title": "Sense and Sensibility",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{"given": "Jane", "family": "Austin"}],
            "url": "http://www.gutenberg.org/ebooks/21839",
            "categories": "fiction, novel"
        },
        "gutenberg:3186": {
            "title": "The Mysterious Stranger, and Other Stories",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{"given": "Mark", "family": "Twain"}],
            "url": "http://www.gutenberg.org/ebooks/3186",
            "categories": "fiction, short story"
        },
        "hathi:uc1321060001561131": {
            "title": "A year of American travel - Narrative of personal experience",
            "formats": ["pdf"],
            "authors": [{"given": "Jessie Benton", "family": "Fremont"}],
            "url": "https://babel.hathitrust.org/cgi/pt?id=uc1.32106000561131;view=1up;seq=9",
            "categories": "non-fiction, memoir"
        }
    }
    test_count = len(test_records)
    for k in test_records:
        v = test_records[k]
        if dataset.create(collection_name, k, v) == False:
            err = dataset.error_message()
            t.error("Failed, could not add", k, "to", collection_name, ', ', err)
    # Test keys, filtering keys and sorting keys
    all_keys = dataset.keys(collection_name)
    if len(all_keys) != test_count:
        t.error("Expected", test_count, "all_keys back, got", all_keys)
    #dataset.verbose_on()
    filter_expr = '(eq .categories "non-fiction, memoir")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 1:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected one key for",
            filter_expr, "got", filtered_keys)
    filter_expr = '(contains .categories "novel")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 3:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected three keys for",
            filter_expr, "got", filtered_keys)
    sort_expr = '+.title'
    filter_expr = '(contains .categories "novel")'
    sorted_keys = dataset.key_sort(collection_name, filtered_keys, sort_expr)
    if len(sorted_keys) != 3:
        t.error(
            f"key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected three keys for",
            filter_expr, "got", sorted_keys)
    expected_keys = ["gutenberg:21839", "gutenberg:21489", "gutenberg:2488"]
    for i, k in enumerate(expected_keys):
        if i < len(sorted_keys) and sorted_keys[i] != k:
            obj1, _ = dataset.read(collection_name, k)
            obj2, _ = dataset.read(collection_name, sorted_keys[i])
            t.error(
                f'key_sort({collection_name}, {filtered_keys}, {sort_expr}), expected {k} (title "{obj1["title"]}") got {sorted_keys[i]} (title "{obj2["title"]}")'
            )
def get_wos_refs(new=True):
    # New=True will download everything from scratch and delete any existing records
    collection = "all_wos.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()
    # Get access token from WOS, set as environment variable with source token.bash
    token = os.environ["WOSTOK"]
    headers = {"X-ApiKey": token, "Content-type": "application/json"}
    # Run query to get scope of records
    base_url = "https://api.clarivate.com/api/wos/?databaseId=WOK"
    collected = dataset.has_key(collection, "captured")
    if collected == True:
        date, err = dataset.read(collection, "captured")
        date = date["captured"]
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = current - date
        base_url = base_url + "&loadTimeSpan=" + str(diff.days) + "D"
    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        if not dataset.update(collection, "captured", record):
            err = dataset.error_message()
            print(f"Unexpected error on update: {err}")
    else:
        if not dataset.create(collection, "captured", record):
            err = dataset.error_message()
            print(f"Unexpected error on create: {err}")
    query = "OG=(California Institute of Technology)"
    query = urllib.parse.quote_plus(query)
    url = base_url + "&usrQuery=" + query + "&count=100&firstRecord=1"
    response = requests.get(url, headers=headers)
    response = response.json()
    record_count = response["QueryResult"]["RecordsFound"]
    print(record_count, " Records from WOS")
    query_id = response["QueryResult"]["QueryID"]
    try:
        records = response["Data"]["Records"]["records"]["REC"]
    except:
        print(response)
    write_records(records, collection)
    # We have saved the first 100 records
    record_start = 101
    record_count = record_count - 100
    query_url = "https://api.clarivate.com/api/wos/query/"
    while record_count > 0:
        print(record_count)
        print(len(records), "records")
        if record_count > 100:
            url = (
                query_url + str(query_id) + "?count=100&firstRecord=" + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            try:
                records = response["Records"]["records"]["REC"]
            except:
                print(response)
            write_records(records, collection)
            record_start = record_start + 100
            record_count = record_count - 100
        else:
            url = (
                query_url
                + str(query_id)
                + "?count="
                + str(record_count)
                + "&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            records = response["Records"]["records"]["REC"]
            write_records(records, collection)
            record_count = 0
    print("Downloaded all records ")
if err != "": print(f"{c_name}, {err}") harvest = False if harvest == True: username = os.environ["USER"] password = os.environ["PW"] returnc = ep_full( c_name, "https://caltechcampuspubs.library.caltech.edu/", username, password ) print(returnc) keys = dataset.keys(c_name) for key in keys: existing, err = dataset.read(c_name, key) # print(existing) new = { "_access": {"metadata_restricted": False, "files_restricted": False}, "_owners": [1], "_created_by": 1, "_default_preview": "previewer one", "access_right": "open", "resource_type": {"type": "publication", "subtype": "publication-other"}, } new["recid"] = existing["eprint_id"] new["titles"] = [{"title": existing["title"], "type": "MainTitle"}] crea = [] if "creators" in existing: for creator in existing["creators"]["items"]: cre = {
def aggregate_usage(usage_collection, month_collection):
    keys = dataset.keys(usage_collection)
    keys.remove("end-date")
    for k in progressbar(keys):
        record, err = dataset.read(usage_collection, k)
        if err != "":
            print(err)
        use = {}
        views = {}
        for usage in record["performance"]:
            split = usage["period"].split("-")
            month = split[0] + "-" + split[1]
            for u in usage["instance"]:
                metric = u["metric-type"]
                if metric == "unique-dataset-requests":
                    if month in use:
                        use[month] += u["count"]
                    else:
                        use[month] = u["count"]
                if metric == "unique-dataset-investigations":
                    if month in views:
                        views[month] += u["count"]
                    else:
                        views[month] = u["count"]
        # Strip non-counter stuff
        record.pop("_Key")
        record.pop("grand-total-unique-requests")
        record.pop("grand-total-unique-investigations")
        # go across months
        for view in views:
            split = view.split("-")
            date_obj = datetime(int(split[0]), int(split[1]), 1)
            d_range = get_month_day_range(date_obj)
            performance = [
                {
                    "period": {
                        "begin-date": d_range[0].date().isoformat(),
                        "end-date": d_range[1].date().isoformat(),
                    },
                    "instance": [],
                }
            ]
            v = views[view]
            performance[0]["instance"].append(
                {
                    "count": v,
                    "metric-type": "unique-dataset-investigations",
                    "access-method": "regular",
                }
            )
            # Handle when we have both views and uses in a given month
            if view in use:
                u = use[view]
                performance[0]["instance"].append(
                    {
                        "count": u,
                        "metric-type": "unique-dataset-requests",
                        "access-method": "regular",
                    }
                )
            existing, err = dataset.read(month_collection, view)
            if err != "":
                print(err)
            record["performance"] = performance
            existing["report-datasets"].append(record)
            if not dataset.update(month_collection, view, existing):
                err = dataset.error_message()
                print(err)
        for use_date in use:
            # We only have use-only records left to handle
            if use_date not in views:
                u = use[use_date]
                split = use_date.split("-")
                date_obj = datetime(int(split[0]), int(split[1]), 1)
                d_range = get_month_day_range(date_obj)
                performance = [
                    {
                        "period": {
                            "begin-date": d_range[0].date().isoformat(),
                            "end-date": d_range[1].date().isoformat(),
                        },
                        "instance": [
                            {
                                "count": u,
                                "metric-type": "unique-dataset-requests",
                                "access-method": "regular",
                            }
                        ],
                    }
                ]
                existing, err = dataset.read(month_collection, use_date)
                if err != "":
                    print(err)
                record["performance"] = performance
                existing["report-datasets"].append(record)
                if not dataset.update(month_collection, use_date, existing):
                    err = dataset.error_message()
                    print(err)
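# aggregate_usage() calls get_month_day_range(), which is defined elsewhere in this
# codebase; the sketch below only illustrates the assumed behavior. Given any
# datetime in a month, it returns the first and last day of that month, which become
# the begin-date and end-date of the COUNTER reporting period.
import calendar

def get_month_day_range(date_obj):
    first = date_obj.replace(day=1)
    last = date_obj.replace(day=calendar.monthrange(date_obj.year, date_obj.month)[1])
    return first, last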