def file_mapping(source_collection):
    """Return a dictionary that maps /tindfiles/serve urls to records."""
    mapping = {}
    dot_paths = [".electronic_location_and_access", "._Key"]
    keys = dataset.keys(source_collection)
    metadata = get_records(dot_paths, "files", source_collection, keys)
    for record in metadata:
        # Handle history records where the key is the item and revision
        k = record["_Key"]
        if "-" in k:
            rec_id = k.split("-")[0]
        else:
            rec_id = k
        # Ignore embargoed records
        if "electronic_location_and_access" in record:
            for filev in record["electronic_location_and_access"]:
                url = filev["uniform_resource_identifier"]
                # name = filev['electronic_name'][0]
                if url not in mapping:
                    mapping[url] = rec_id
    return mapping
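
# A minimal usage sketch, not part of the original scripts: the collection name
# is borrowed from the harvest step elsewhere in this repo, and the slice is only
# to keep the printout short. It illustrates that file_mapping() returns a plain
# {file_url: record_id} dictionary.
demo_mapping = file_mapping("caltechdata.ds")
for file_url, rec_id in list(demo_mapping.items())[:5]:
    print(file_url, "->", rec_id)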
def get_subset(collection):
    # Demo pulling out a subset of records from a collection
    # using a pandas data frame.
    # Get all files with cell type "Xenotransplanted microglia"
    keys = dataset.keys(collection)
    dot_paths = [".cell_source", ".species", ".tissue", "._Key"]
    (grid, err) = dataset.grid(collection, keys, dot_paths)
    if err != "":
        print(err)
        exit()
    df = pd.DataFrame(np.array(grid), columns=["source", "species", "tissue", "key"])
    grouped = df.groupby(["source"])
    print(grouped.groups.keys())
    records = grouped.get_group('Xenotransplanted microglia')
    for index, r in records.iterrows():
        print('getting files for ', r['key'])
        err = dataset.detach(collection, r['key'], [])
        if err != '':
            print(err)
    # Example doing the same thing with frames
    labels = ["source", "species", "tissue", "key"]
    f, err = dataset.frame(collection, 'frame_name', keys, dot_paths, labels)
    if err != "":
        print(err)
    records = dataset.frame_objects(collection, 'frame_name')
    for record in records:
        if record['source'] == 'Xenotransplanted microglia':
            print('getting files for ', record['key'])
            err = dataset.detach(collection, record['key'], [])
            if err != '':
                print(err)
def add_citation(collection, token, production=True):
    """Add example citation text in the description field"""
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        description = record["descriptions"]
        cite_exists = False
        for d in description:
            descr_text = d["description"]
            if descr_text.startswith("<br>Cite this record as:"):
                cite_exists = True
        if cite_exists == False:
            record_doi = record["identifier"]["identifier"]
            headers = {"Accept": "text/x-bibliography; style=apa"}
            citation_link = "https://doi.org/"
            citation = requests.get(citation_link + record_doi, headers=headers).text
            doi_url = "https://doi.org/" + record_doi
            if doi_url in citation:
                # Check that we have a citation and not a server error,
                # otherwise wait till next time
                n_txt = citation_text(citation, doi_url, record_doi)
                description.append({"descriptionType": "Other", "description": n_txt})
                response = caltechdata_edit(
                    token, k, {"descriptions": description}, {}, {}, production
                )
                print(response)
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage information"""
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage.
            # All dates are the date added to CaltechDATA, which is
            # the appropriate 'publication' date even if content was available
            # earlier.
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
def test_issue12(t, c_name):
    src = '''[
  {"id": "1", "c1": 1, "c2": 2, "c3": 3 },
  {"id": "2", "c1": 2, "c2": 2, "c3": 3 },
  {"id": "3", "c1": 3, "c2": 3, "c3": 3 },
  {"id": "4", "c1": 1, "c2": 1, "c3": 1 },
  {"id": "5", "c1": 6, "c2": 6, "c3": 6 }
]'''
    #dataset.verbose_on() # DEBUG
    #dataset.use_strict_dotpath(True) # DEBUG
    if dataset.status(c_name) == False:
        if not dataset.init(c_name):
            err = dataset.error_message()
            t.error(f'failed to create {c_name}, {err}')
            return
    objects = json.loads(src)
    for obj in objects:
        key = obj['id']
        if dataset.has_key(c_name, key):
            dataset.update(c_name, key, obj)
        else:
            dataset.create(c_name, key, obj)
    f_names = dataset.frames(c_name)
    for f_name in f_names:
        ok = dataset.delete_frame(c_name, f_name)
        if ok == False:
            err = dataset.error_message()
            t.error(f'Failed to delete {f_name} from {c_name} -> "{err}"')
            return
        if dataset.has_frame(c_name, f_name) == True:
            t.error(
                f'Failed to delete frame {f_name} from {c_name}, frame still exists'
            )
            return
    f_name = 'issue12'
    dot_paths = [".c1", ".c3"]
    labels = [".col1", ".col3"]
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(f'failed to create {f_name} from {c_name}, {err}')
    if not dataset.has_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected frame {f_name} to exist, {err}')
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) == 0:
        err = dataset.error_message()
        t.error(f'expected keys in {f_name}, got zero, {err}')
        return
    f_objects = dataset.frame_objects(c_name, f_name)
    if len(f_objects) == 0:
        err = dataset.error_message()
        t.error(f'expected objects in {f_name}, got zero, {err}')
        return
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected to delete {f_name} in {c_name}, {err}')
def update_datacite_metadata(collection, token, access):
    """Access contains username, password, and prefix for DataCite"""
    keys = dataset.keys(collection)
    for a in access:
        username = a["username"]
        password = a["password"]
        prefix = a["prefix"]
        # Initialize the MDS client.
        d = DataCiteMDSClient(
            username=username,
            password=password,
            prefix=prefix,
            url="https://mds.datacite.org",
        )
        for k in keys:
            print(k)
            metadata, err = dataset.read(collection, k)
            if err != "":
                print(err)
                exit()
            # Get rid of Key from dataset
            metadata.pop("_Key")
            if "identifier" in metadata:
                record_doi = metadata["identifier"]["identifier"]
                # Handle records with 4.3 metadata elements
                if "schemaVersion" in metadata:
                    metadata.pop("schemaVersion")
                if "types" in metadata:
                    metadata.pop("types")
                if record_doi.split("/")[0] == prefix:
                    result = schema40.validate(metadata)
                    # Debugging if this fails
                    if result == False:
                        print(metadata)
                        v = schema40.validator
                        errors = sorted(v.iter_errors(metadata), key=lambda e: e.path)
                        for error in errors:
                            print(error.message)
                        exit()
                    xml = schema40.tostring(metadata)
                    response = d.metadata_post(xml)
                    print(response)
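
# Illustrative only: per the docstring above, the access argument is a list of
# DataCite credential dictionaries with username, password, and prefix keys.
# The username and password values here are placeholders, not real credentials;
# the 10.22002 prefix appears elsewhere in these scripts.
access = [
    {
        "username": "CALTECH.EXAMPLE",   # placeholder DataCite symbol
        "password": "not-a-real-password",
        "prefix": "10.22002",
    }
]
# update_datacite_metadata("caltechdata.ds", token, access)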
def get_multiple_links(input_collection, output_collection):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                count = idvs.count(idv["relatedIdentifier"])
                if count > 1:
                    print("DUPE")
                    print(k)
                    print(idv["relatedIdentifier"])
def match_codemeta():
    collection = "github_records.ds"
    keys = dataset.keys(collection)
    for k in keys:
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        if "completed" not in existing:
            print("Processing new record ", k)
            if dataset.attachments(collection, k) != "":
                dataset.detach(collection, k)
                # Update CaltechDATA
                token = os.environ["TINDTOK"]
                infile = open("codemeta.json", "r")
                try:
                    meta = json.load(infile)
                except:
                    print("Invalid json file - Skipping forever ", k)
                else:
                    standardized = codemeta_to_datacite(meta)
                    # Check that all records have a GitHub subject tag
                    add = True
                    for s in standardized["subjects"]:
                        if s["subject"] == "Github":
                            add = False
                        if s["subject"] == "GitHub":
                            add = False
                    if add == True:
                        standardized["subjects"].append({"subject": "GitHub"})
                    response = caltechdata_edit(token, k, standardized, {}, {}, True)
                    print(response)
                os.system("rm codemeta.json")
            existing["completed"] = "True"
            if not dataset.update(collection, k, existing):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
def add_files(collection):
    # Run through all elements in collection
    keys = dataset.keys(collection)
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != '':
            print(err)
            exit()
        url = record['url_links']
        print('Processing file from ', url)
        # Make a dummy file to represent results from kallisto
        files = ['example_file' + k]
        for f in files:
            with open(f, "w") as file:
                file.write(" 0 1 0 " + k)
        # Now attach file to collection
        err = dataset.attach(collection, k, files)
        if err != '':
            print(err)
            exit()
        # Cleanup local disk
        for f in files:
            os.remove(f)
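
# A hedged sketch of exercising add_files(); the collection name, key, and URL
# below are illustrative placeholders, not part of the original script. It sets
# up a throwaway collection with one record carrying the 'url_links' field the
# function expects, runs add_files, and lists the resulting attachment.
from py_dataset import dataset

demo_coll = "add_files_demo.ds"
if not dataset.status(demo_coll):
    dataset.init(demo_coll)
dataset.create(demo_coll, "r1", {"url_links": "https://example.org/reads.fastq"})
add_files(demo_coll)
print(dataset.attachments(demo_coll, "r1"))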
) parser.add_argument("-recid", help="Eprints recid") parser.add_argument("-start_recid", help="Eprints recid to start at") parser.add_argument( "-test", help= "Uses feeds data and writes report of what would be changed, but makes no changes. Provide output file name", ) parser.add_argument("-username", help="Eprints username") parser.add_argument("-password", help="Eprints password") args = parser.parse_args() if args.test: source = get_caltechfeed(args.repository) keys = dataset.keys(source) fout = open("../" + args.test, "w", newline="\n", encoding="utf-8-sig") file_out = csv.writer(fout) else: if args.repository in ["authors", "thesis", "caltechcampuspubs"]: source = "https://" else: source = "http://" if args.username: source = source + args.username + ":" + args.password + "@" source = source + args.repository + ".library.caltech.edu" keys = get_eprint_keys(source) file_out = None if args.start_recid: keys = [k for k in keys if int(k) >= int(args.start_recid)] if args.update_type == "resolver":
def test_attachments(t, collection_name):
    t.print("Testing attach, attachments, detach and prune")
    # Generate two files to attach.
    with open('a1.txt', 'w') as text_file:
        text_file.write('This is file a1')
    with open('a2.txt', 'w') as text_file:
        text_file.write('This is file a2')
    filenames = ['a1.txt', 'a2.txt']

    if dataset.status(collection_name) == False:
        t.error("Failed,", collection_name, "missing")
        return
    keys = dataset.keys(collection_name)
    if len(keys) < 1:
        t.error("Failed,", collection_name, "should have keys")
        return
    key = keys[0]
    if dataset.attach(collection_name, key, filenames) == False:
        err = dataset.error_message()
        t.error("Failed, to attach files for", collection_name, key, filenames, ', ', err)
        return
    l = dataset.attachments(collection_name, key)
    if len(l) != 2:
        t.error("Failed, expected two attachments for", collection_name, key, "got", l)
        return
    # Check that attachments aren't impacted by update
    if dataset.update(collection_name, key, {"testing": "update"}) == False:
        err = dataset.error_message()
        t.error("Failed, to update record", collection_name, key, err)
        return
    l = dataset.attachments(collection_name, key)
    if len(l) != 2:
        t.error("Failed, expected two attachments after update for", collection_name, key, "got", l)
        return
    if os.path.exists(filenames[0]):
        os.remove(filenames[0])
    if os.path.exists(filenames[1]):
        os.remove(filenames[1])
    # First try detaching one file.
    if dataset.detach(collection_name, key, [filenames[1]]) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for", collection_name, key, filenames[1], ', ', err)
    if os.path.exists(filenames[1]):
        os.remove(filenames[1])
    else:
        t.error("Failed to detach", filenames[1], "from", collection_name, key)
    # Test detach with explicit filenames
    if dataset.detach(collection_name, key, filenames) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for", collection_name, key, filenames, ', ', err)
    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)
        else:
            t.error("Failed, expected", fname, "to be detached from", collection_name, key)
    # Test detaching all files
    if dataset.detach(collection_name, key, []) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for (detaching all)", collection_name, key, ', ', err)
    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)
        else:
            t.error("Failed, expected", fname, "for detaching all from", collection_name, key)
    if dataset.prune(collection_name, key, [filenames[0]]) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for prune", collection_name, key, [filenames[0]], ', ', err)
    l = dataset.attachments(collection_name, key)
    if len(l) != 1:
        t.error("Failed, expected one file after prune for", collection_name, key, [filenames[0]], "got", l)
    if dataset.prune(collection_name, key, []) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for prune (all)", collection_name, key, ', ', err)
    l = dataset.attachments(collection_name, key)
    if len(l) != 0:
        t.error("Failed, expected zero files after prune for", collection_name, key, "got", l)
args = parser.parse_args()

name = args.data_collection[0]
sheet = args.input_sheet[0]
output_sheet = args.output_sheet[0]

import_coll = "imported.ds"
os.system("rm -rf imported.ds")
dataset.init(import_coll)
os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"
err = dataset.import_gsheet(import_coll, sheet, 'Sheet1', 1, 'A:CZ')
if err != '':
    print(err)

keys = dataset.keys(import_coll)
coauthors = []
count = 0
for key in progressbar(keys, redirect_stdout=True):
    record, err = dataset.read(name, key)
    if err != "":
        print(err)
    count = 0
    if 'identifiers' in record:
        identifiers = record['identifiers']
    else:
        identifiers = []
        print(key)
        print(record)
def test_sync_csv(t, c_name):
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return
    # Setup test CSV instance
    t_data = [
        {"_Key": "one", "value": 1},
        {"_Key": "two", "value": 2},
        {"_Key": "three", "value": 3},
    ]
    csv_name = c_name.strip(".ds") + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)
    # Import CSV into collection
    if dataset.import_csv(c_name, csv_name, True) == False:
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if dataset.has_key(c_name, key) == False:
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five") == True:
        t.error("expected has_key('five') == False, got True")
    if dataset.create(c_name, "five", {"value": 5}) == False:
        err = dataset.error_message()
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return
    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if dataset.frame_create(c_name, frame_name, keys,
                            ["._Key", ".value"], ["_Key", "value"]) == False:
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return
    # NOTE: Tests for sync_send_csv and sync_recieve_csv
    if dataset.sync_send_csv(c_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
        if 'five' not in src:
            t.error(f"expected 'five' in src, got {src}")
    # Now remove "five" from collection
    if dataset.delete(c_name, "five") == False:
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five") == True:
        t.error("expected has_key(five) == False, got True")
        return
    if dataset.sync_recieve_csv(c_name, frame_name, csv_name, False) == False:
        err = dataset.error_message()
        t.error(
            f'sync_recieve_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if dataset.has_key(c_name, "five") == False:
        t.error("expected has_key(five) == True, got False")
        return
def update_datacite_media(username, password, collection, prefix):
    keys = dataset.keys(collection)
    if path.exists("mediaupdate"):
        with open("mediaupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(keys, redirect_stdout=True):
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        atlas = False
        subjects = existing["subjects"]
        for subject in subjects:
            if (subject["subject"].strip()
                    == "Atlas of Bacterial and Archaeal Cell Structure"):
                atlas = True
        record_update = datetime.fromisoformat(existing["updated"]).date()
        # Subtraction to get window to grab records that were updated between runs
        if record_update > update - timedelta(days=2):
            if "electronic_location_and_access" in existing:
                doi = existing["identifier"]["identifier"]
                record_prefix = doi.split("/")[0]
                if record_prefix == prefix:
                    delete_datacite_media(username, password, doi)
                    for file_met in existing["electronic_location_and_access"]:
                        url = "https://mds.datacite.org/media/" + doi
                        headers = {"Content-Type": "application/txt;charset=UTF-8"}
                        extension = file_met["electronic_name"][0].split(".")[-1]
                        filename = file_met["electronic_name"][0].split(".")[0]
                        data = {}
                        if extension == "nc":
                            data = ("application/x-netcdf="
                                    + file_met["uniform_resource_identifier"])
                        elif extension == "mp4":
                            if atlas:
                                data = ("video/mp4="
                                        + "https://www.cellstructureatlas.org/videos/"
                                        + filename + ".mp4")
                            else:
                                data = ("video/mp4="
                                        + file_met["uniform_resource_identifier"])
                        elif extension == "mj2":
                            data = ("video/mj2="
                                    + file_met["uniform_resource_identifier"])
                        elif extension == "avi":
                            data = ("video/avi="
                                    + file_met["uniform_resource_identifier"])
                        elif extension == "mov":
                            data = ("video/quicktime="
                                    + file_met["uniform_resource_identifier"])
                        elif extension == "gz":
                            data = ("application/gzip="
                                    + file_met["uniform_resource_identifier"])
                        elif extension == "zip":
                            data = ("application/zip="
                                    + file_met["uniform_resource_identifier"])
                        elif extension == "h5ad":
                            data = ("application/octet-stream="
                                    + file_met["uniform_resource_identifier"])
                        if data != {}:
                            print(doi)
                            print(data)
                            r = requests.post(
                                url,
                                data=data.encode("utf-8"),
                                auth=(username, password),
                                headers=headers,
                            )
                            print(r)
description="caltechdata_backup queries the caltechDATA (Invenio 3) API\ returns data and adds to dataset structure on disk") collection = "caltechdata.ds" if os.path.isdir(collection) == False: err = dataset.init(collection) if err != "": print(f"Failed on create {err}") exit() args = parser.parse_args() api_url = "https://data.caltech.edu/api/records/" # Get the existing records current = dataset.keys(collection) req = requests.get(api_url) data = req.json() temp = 'temp' if os.path.isdir(temp) == False: os.mkdir(temp) os.chdir(temp) collection = '../' + collection read_records(data["hits"]["hits"], current, collection) # if we have more pages of data while "next" in data["links"]: req = requests.get(data["links"]["next"]) data = req.json()
def add_usage(collection, token, usage_collection, production=True):
    """Add in usage text in the description field"""
    keys = dataset.keys(collection)
    biggest_views = 0
    biggest_views_record = ""
    biggest_downloads = 0
    biggest_downloads_record = ""
    total_views = 0
    total_downloads = 0
    for k in keys:
        record, err = dataset.read(collection, k)
        if err != "":
            print(err)
            exit()
        usage, err = dataset.read(usage_collection, k)
        views = usage["grand-total-unique-investigations"]
        downloads = usage["grand-total-unique-requests"]
        if views > biggest_views:
            biggest_views = views
            biggest_views_record = k
        if downloads > biggest_downloads:
            biggest_downloads = downloads
            biggest_downloads_record = k
        total_views += views
        total_downloads += downloads
        date = datetime.fromisoformat(usage["dataset-dates"][0]["value"])
        now = datetime.today()
        first = date.strftime("%B %d, %Y")
        last = now.strftime("%B %d, %Y")
        if views > 1:
            u_txt = (
                "<br>Unique Views: "
                + str(views)
                + "<br>Unique Downloads: "
                + str(downloads)
                + "<br> between "
                + first
                + " and "
                + last
                + '<br><a href="https://data.caltech.edu/stats"'
                + ">More info on how stats are collected</a><br>"
            )
            description = record["descriptions"]
            use_exists = False
            for d in description:
                descr_text = d["description"]
                # We always update an existing listing
                if descr_text.startswith("<br>Unique Views:"):
                    d["description"] = u_txt
                    use_exists = True
            # Otherwise we add a new one
            if use_exists == False:
                description.append({"descriptionType": "Other", "description": u_txt})
            response = caltechdata_edit(
                token, k, {"descriptions": description}, {}, {}, production
            )
            print(response)
    print(f"Most downloads {biggest_downloads} for record {biggest_downloads_record}")
    print(f"Most views {biggest_views} for record {biggest_views_record}")
    print(f"Total downloads {total_downloads}")
    print(f"Total views {total_views}")
def test_keys(t, collection_name):
    '''test_keys(collection_name) test getting, filtering and sorting keys'''
    # Test count after delete
    key_list = dataset.keys(collection_name)
    cnt = dataset.count(collection_name)
    if cnt != 0:
        t.error("Failed, expected zero records, got", cnt, key_list)
    #
    # Generate multiple records for collection for testing keys
    #
    test_records = {
        "gutenberg:21489": {
            "title": "The Secret of the Island",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{"given": "Jules", "family": "Verne"}],
            "url": "http://www.gutenberg.org/ebooks/21489",
            "categories": "fiction, novel"
        },
        "gutenberg:2488": {
            "title": "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{"given": "Jules", "family": "Verne"}],
            "url": "https://www.gutenberg.org/ebooks/2488",
            "categories": "fiction, novel"
        },
        "gutenberg:21839": {
            "title": "Sense and Sensibility",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{"given": "Jane", "family": "Austin"}],
            "url": "http://www.gutenberg.org/ebooks/21839",
            "categories": "fiction, novel"
        },
        "gutenberg:3186": {
            "title": "The Mysterious Stranger, and Other Stories",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{"given": "Mark", "family": "Twain"}],
            "url": "http://www.gutenberg.org/ebooks/3186",
            "categories": "fiction, short story"
        },
        "hathi:uc1321060001561131": {
            "title": "A year of American travel - Narrative of personal experience",
            "formats": ["pdf"],
            "authors": [{"given": "Jessie Benton", "family": "Fremont"}],
            "url": "https://babel.hathitrust.org/cgi/pt?id=uc1.32106000561131;view=1up;seq=9",
            "categories": "non-fiction, memoir"
        }
    }
    test_count = len(test_records)
    for k in test_records:
        v = test_records[k]
        if dataset.create(collection_name, k, v) == False:
            err = dataset.error_message()
            t.error("Failed, could not add", k, "to", collection_name, ', ', err)
    # Test keys, filtering keys and sorting keys
    all_keys = dataset.keys(collection_name)
    if len(all_keys) != test_count:
        t.error("Expected", test_count, "all_keys back, got", all_keys)
    #dataset.verbose_on()
    filter_expr = '(eq .categories "non-fiction, memoir")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 1:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected one key for",
            filter_expr, "got", filtered_keys)
    filter_expr = '(contains .categories "novel")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 3:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected three keys for",
            filter_expr, "got", filtered_keys)
    sort_expr = '+.title'
    filter_expr = '(contains .categories "novel")'
    sorted_keys = dataset.key_sort(collection_name, filtered_keys, sort_expr)
    if len(sorted_keys) != 3:
        t.error(
            f"key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected three keys for",
            filter_expr, "got", sorted_keys)
    expected_keys = ["gutenberg:21839", "gutenberg:21489", "gutenberg:2488"]
    for i, k in enumerate(expected_keys):
        if i < len(sorted_keys) and sorted_keys[i] != k:
            obj1, _ = dataset.read(collection_name, k)
            obj2, _ = dataset.read(collection_name, sorted_keys[i])
            t.error(
                f'key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected {k} (title "{obj1["title"]}") got {sorted_keys[i]} (title "{obj2["title"]}")'
            )
def fix_multiple_links(input_collection, token):
    keys = dataset.keys(input_collection)
    for k in keys:
        record, err = dataset.read(input_collection, k)
        if err != "":
            print(err)
            exit()
        if "relatedIdentifiers" in record:
            idvs = []
            new = []
            dupes = []
            replace = False
            record_doi = record["identifier"]["identifier"]
            for idv in record["relatedIdentifiers"]:
                idvs.append(idv["relatedIdentifier"])
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == record_doi:
                    # Having a related identifier that is the same as the record
                    # doi doesn't make any sense
                    replace = True
                    dupes.append(identifier)
                else:
                    count = idvs.count(identifier)
                    if count > 1:
                        replace = True
                        if identifier not in dupes:
                            # We need to save the first duplicate
                            new.append(idv)
                            # Add to list of those already saved
                            dupes.append(identifier)
                        else:
                            # This will be deleted
                            dupes.append(identifier)
                    else:
                        # Save all unique ids
                        new.append(idv)
            if replace == True:
                print("Duplicate links found in record ", k)
                print("Will delete these links", dupes)
                response = input("Do you approve this change? Y or N")
                new_metadata = {"relatedIdentifiers": new}
                if response == "Y":
                    response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                    print(response)
        if "alternateIdentifiers" in record:
            idtypes = []
            alt_ids = []
            repeat = False
            for idv in record["alternateIdentifiers"]:
                if idv["alternateIdentifierType"] not in idtypes:
                    # If we haven't seen id type before, save it
                    alt_ids.append(idv)
                    idtypes.append(idv["alternateIdentifierType"])
                else:
                    repeat = True
                    print("Will Delete Repeated ID ", idv["alternateIdentifier"])
            if repeat == True:
                new_metadata = {"alternateIdentifiers": alt_ids}
                response = caltechdata_edit(token, k, new_metadata, {}, {}, True)
                print(response)
def match_cd_refs():
    token = os.environ["TINDTOK"]
    matches = []
    collection = "caltechdata.ds"
    keys = dataset.keys(collection)
    if "mediaupdate" in keys:
        keys.remove("mediaupdate")
    # Get event data results
    event_data = "crossref_refs.ds"
    event_keys = dataset.keys(event_data)
    event_keys.remove("captured")
    f_name = "match_cd_refs"
    dot_paths = [".obj_id", ".id", ".subj_id"]
    labels = ["obj_id", "id", "subj_id"]
    print("Getting Event Data Records")
    if dataset.has_frame(event_data, f_name):
        if not dataset.frame_reframe(event_data, f_name, event_keys):
            err = dataset.error_message()
            print(f"Failed to reframe {f_name} in {event_data}, {err}")
            exit()
    elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels):
        err = dataset.error_message()
        print(f"Failed to create frame {f_name} in {event_data}, {err}")
        exit()
    grid = dataset.frame_grid(event_data, f_name)
    df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"])
    grouped = df.groupby(["obj_id"])
    groups = grouped.groups
    # Look at all CaltechDATA records
    for k in keys:
        # Collect matched new links for the record
        record_matches = []
        print(k)
        metadata, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        doi = "https://doi.org/" + metadata["identifier"]["identifier"]
        if doi in groups:
            hits = grouped.get_group(doi)
            for index, h in hits.iterrows():
                # Trigger for whether we already have this link
                new = True
                if "relatedIdentifiers" in metadata:
                    for m in metadata["relatedIdentifiers"]:
                        if m["relatedIdentifier"] in h["subj_id"]:
                            new = False
                if new == True:
                    match = h["subj_id"]
                    print(match)
                    print(h["obj_id"])
                    inputv = input("Do you approve this link? Type Y or N: ")
                    if inputv == "Y":
                        record_matches.append(match)
        # If we have to update record
        if len(record_matches) > 0:
            ids = []
            if "relatedIdentifiers" in metadata:
                for m in metadata["relatedIdentifiers"]:
                    ids.append(m)
            matches.append([k, record_matches])
            # Now collect identifiers for record
            for match in record_matches:
                split = match.split("doi.org/")
                new_id = {
                    "relatedIdentifier": split[1],
                    "relatedIdentifierType": "DOI",
                    "relationType": "IsCitedBy",
                }
                ids.append(new_id)
            newmetadata = {"relatedIdentifiers": ids}
            response = caltechdata_edit(token, k, newmetadata, {}, {}, True)
            print(response)
    return matches
import os
from py_dataset import dataset
from ames.harvesters import get_caltechfeed, get_records

if __name__ == "__main__":
    import_coll = "imported.ds"
    sheet = "1ZI3-XvQ_3rLcKrF-4FBa2tEInIdQfOnGJ9L_NmhmoGs"
    os.system("rm -rf imported.ds")
    dataset.init(import_coll)
    err = dataset.import_gsheet(import_coll, sheet, "CaltechPEOPLE", 4, "A:AA")
    if err != "":
        print(err)
    people_list = dataset.keys(import_coll)
    people = []
    for p in people_list:
        record, err = dataset.read(import_coll, p)
        people.append(record)
    # Profiles collection from feeds
    profile_ds = "profiles.ds"
    keys = dataset.keys(profile_ds)
    labels = ["orcid", "creator_id"]
    dot_paths = [".orcid", ".creator_id"]
    all_metadata = get_records(dot_paths, "profile", profile_ds, keys, labels)
    for profile in all_metadata:
        if "creator_id" in profile:
            idv = profile["creator_id"]
        else:
def agent_report(file_name, repo, aspace):
    dot_paths = [
        "._Key",
        ".directory_info",
        ".ORCID",
        ".sort_name",
        ".ArchivesSpace_ID",
        ".family",
        ".given",
    ]
    labels = ["id", "directory_info", "orcid", "name", "as", "family", "given"]
    source = get_caltechfeed("people")
    keys = dataset.keys(source)
    keys.remove("captured")
    all_metadata = get_records(dot_paths, "p_list", source, keys, labels)
    all_metadata.sort(key=lambda all_metadata: all_metadata["id"])
    fname = file_name.split(".")[0]
    fcaltechpeople = fname + "_caltechpeople.csv"
    fmatched = fname + "_matched.csv"
    fnew_caltechpeople = fname + "_newcaltechpeople.csv"
    fnew_aspace = fname + "_newaspace.csv"
    caltechpeople = csv.writer(open(fcaltechpeople, "w"))
    matched = csv.writer(open(fmatched, "w"))
    new_caltechpeople = csv.writer(open(fnew_caltechpeople, "w"))
    new_aspace = csv.writer(open(fnew_aspace, "w"))
    to_match = {}
    gen_match = {}
    already_matched = {}
    aspace_url = "https://collections.archives.caltech.edu/agents/people/"
    feeds_url = "https://feeds.library.caltech.edu/people/"
    for metadata in all_metadata:
        if "as" in metadata:
            if metadata["as"] != "":
                already_matched[metadata["as"]] = metadata
            else:
                to_match[metadata["name"]] = metadata
                gen_match[metadata["family"]] = metadata
    print(f"{len(already_matched)} agents already in CaltechPEOPLE")
    print("Requesting agents")
    for agent in progressbar(aspace.agents):
        if agent.agent_type == "agent_person":
            primary_name = agent.display_name.primary_name
            name = agent.display_name.sort_name
            published = agent.publish
            uid = int(agent.uri.split("/")[-1])
            if uid not in already_matched:
                if name in to_match:
                    person = to_match[name]
                    matched.writerow([
                        person["name"],
                        uid,
                        aspace_url + str(uid),
                        person["id"],
                        feeds_url + person["id"],
                        published,
                    ])
                    to_match.pop(name)
                else:
                    new_caltechpeople.writerow(
                        [name, uid, aspace_url + str(uid), published])
            else:
                metadata = already_matched[uid]
                caltechpeople.writerow([
                    metadata["name"],
                    metadata["as"],
                    aspace_url + str(metadata["as"]),
                    metadata["id"],
                    feeds_url + metadata["id"],
                    published,
                ])
    for name in to_match:
        new_aspace.writerow(
            [name, to_match[name]["id"], feeds_url + to_match[name]["id"]])
def test_issue43(t, collection_name, csv_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if os.path.exists(csv_name):
        os.remove(csv_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'Failed, need a {collection_name} to run test, {err}')
        return
    table = {
        "r1": {"c1": "one", "c2": "two", "c3": "three", "c4": "four"},
        "r2": {"c1": "one", "c3": "three", "c4": "four"},
        "r3": {"c1": "one", "c2": "two", "c4": "four"},
        "r4": {"c1": "one", "c2": "two", "c3": "three"},
        "r5": {"c1": "one", "c2": "two", "c3": "three", "c4": "four"}
    }
    for key in table:
        row = table[key]
        if dataset.create(collection_name, key, row) == False:
            err = dataset.error_message()
            t.error(f"Can't add test row {key} to {collection_name}, {err}")
            return
    dataset.use_strict_dotpath(False)
    # Setup frame
    frame_name = 'f1'
    keys = dataset.keys(collection_name)
    if dataset.frame_create(collection_name, frame_name, keys,
                            ["._Key", ".c1", ".c2", ".c3", ".c4"],
                            ["_Key", "c1", "c2", "c3", "c4"]) == False:
        err = dataset.error_message()
        t.error(err)
        return
    if dataset.export_csv(collection_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'export_csv({collection_name}, {frame_name}, {csv_name} should have emitted warnings, not error, {err}'
        )
        return
    with open(csv_name, mode='r', encoding='utf-8') as f:
        rows = f.read()
    for row in rows.split('\n'):
        if len(row) > 0:
            cells = row.split(',')
            if len(cells) < 5:
                t.error(f'row error {csv_name} for {cells}')
#!/usr/bin/env python3
import sys
from datetime import datetime
from py_dataset import dataset

#
# Loop through the keys, fetch the record and append a _State: "deposit" to
# each object.
#
c_name = "people.ds"
keys = dataset.keys(c_name)
#print(f"DEBUG Keys: {keys}")
for key in keys:
    print(f"Fixing key {key}")
    data, err = dataset.read(c_name, key)
    if err != "":
        print(f"Error read {c_name} -> {key}, {err}")
        sys.exit(1)
    # Make fieldname lower case
    dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    obj = {
        "_Key": key,
        "_State": "deposit",
        "_Updated": f"{dt}",
        "_Created": f"{dt}"
    }
    for field in data:
        fkey = field.lower()
        if not ' ' in fkey:
os.mkdir("data") os.chdir("data") production = True collection = "caltechdata.ds" files = True if files: get_caltechdata(collection, production) mapping = file_mapping(collection) history = False if history: keys = dataset.keys(collection) h_collection = "caltechdata_history.ds" get_history(h_collection, collection, keys) mapping = file_mapping(h_collection) update = True usage_collection = "caltechdata_usage.ds" if update: token = os.environ["MATTOK"] build_usage(collection, usage_collection) get_usage(usage_collection, mapping, token) token = os.environ["TINDTOK"] add_usage(collection, token, usage_collection, production) aggregate = True
def add_thesis_doi(data_collection, thesis_collection, token, production=True):
    """Add thesis DOI to CaltechDATA records"""
    # Search across CaltechTHESIS DOIs
    dot_paths = ["._Key", ".doi", ".official_url", ".related_url"]
    labels = ["eprint_id", "doi", "official_url", "related_url"]
    keys = dataset.keys(thesis_collection)
    all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels)
    dois = []
    for metadata in progressbar(all_metadata, redirect_stdout=True):
        if "doi" in metadata:
            record_doi = metadata["doi"].strip()
            if "related_url" in metadata and "items" in metadata["related_url"]:
                items = metadata["related_url"]["items"]
                for item in items:
                    if "url" in item:
                        url = item["url"].strip()
                    if "type" in item:
                        itype = item["type"].strip().lower()
                    if itype == "doi":
                        if idutils.is_doi(url):
                            doi = "10." + url.split("10.")[1]
                            prefix = doi.split("/")[0]
                            if prefix == "10.22002":
                                dois.append([doi, record_doi])
                        else:
                            print("Ignoring non-DOI")
                            print(metadata["eprint_id"])
                            print(url.split("10."))
    for doi_link in dois:
        cd_doi = doi_link[0]
        thesis_doi = doi_link[1]
        print("Checking " + cd_doi)
        if "D1" in cd_doi:
            record_number = cd_doi.split("D1.")[1]
        if "d1" in cd_doi:
            record_number = cd_doi.split("d1.")[1]
        record, err = dataset.read(data_collection, record_number)
        if err != "":
            print(err)
            exit()
        done = False
        if "relatedIdentifiers" in record:
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == thesis_doi:
                    done = True
            if done == False:
                identifiers = record["relatedIdentifiers"]
                identifiers.append(
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                )
                new_metadata = {"relatedIdentifiers": identifiers}
        else:
            new_metadata = {
                "relatedIdentifiers": [
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                ]
            }
        if done == False:
            print("Adding " + thesis_doi + " to " + cd_doi)
            response = caltechdata_edit(
                token, record_number, new_metadata, {}, {}, True
            )
            print(response)
def test_basic(t, collection_name):
    '''test_basic(collection_name) runs tests on basic CRUD ops'''
    # Setup a test record
    key = "2488"
    value = {
        "title": "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
        "formats": ["epub", "kindle", "plain text"],
        "authors": [{"given": "Jules", "family": "Verne"}],
        "url": "https://www.gutenberg.org/ebooks/2488"
    }
    # We should have an empty collection, we will create our test record.
    if dataset.create(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f'create({collection_name}, {key}, {value}) failed, {err}')
        return
    # Check to see that we have only one record
    key_count = dataset.count(collection_name)
    if key_count != 1:
        t.error(f"Failed, expected count to be 1, got {key_count}")
    # Do a minimal test to see if the record looks like it has content
    keyList = dataset.keys(collection_name)
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"expected {v} got {rec[k]}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with list v, got {v}")
    # Test updating record
    value["verified"] = True
    if dataset.update(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f"update({collection_name}, {key}, {value}) failed, {err}")
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"expected {v} got {rec[k]} for key {k}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with a list for v, got {v}")
    # Test path to record
    expected_s = "/".join(
        [collection_name, "pairtree", "24", "88", (key + ".json")])
    expected_l = len(expected_s)
    p = dataset.path(collection_name, key)
    if len(p) != expected_l:
        t.error("Failed, expected length", expected_l, "got", len(p))
    if p != expected_s:
        t.error("Failed, expected", expected_s, "got", p)
    # Test listing records
    l = dataset.list(collection_name, [key])
    if len(l) != 1:
        t.error(
            f"list({collection_name}, [{key}]) failed, list should return an array of one record, got",
            l)
        return
    # test deleting a record
    if dataset.delete(collection_name, key) == False:
        err = dataset.error_message()
        t.error("Failed, could not delete record", key, ", ", err)
def release_files(source, base_url, outfile=None):
    if source.split(".")[-1] == "ds":
        # This generates report
        dot_paths = [
            ".eprint_id",
            ".documents",
            ".date",
            ".eprint_status",
            ".creators.items[0].name.family",
            ".thesis_type",
            ".full_text_status",
        ]
        labels = [
            "eprint_id",
            "documents",
            "date",
            "status",
            "family",
            "type",
            "full_text",
        ]
        keys = dataset.keys(source)
        all_metadata = get_records(dot_paths, "official", source, keys, labels)
        all_metadata.sort(key=lambda all_metadata: all_metadata["family"])
        all_metadata.sort(key=lambda all_metadata: all_metadata["date"])
        for meta in all_metadata:
            year = meta["date"].split("-")[0]
            if is_in_range("2004-2005", year):
                if thesis_match(meta):
                    files = []
                    fnames = []
                    count = 0
                    for document in meta["documents"]:
                        count = count + 1
                        if document["security"] == "validuser":
                            files.append(count)
                            fnames.append(document["main"])
                    if len(files) > 0:
                        eprint_id = meta["eprint_id"]
                        print(eprint_id)
                        outfile.writerow(
                            [
                                year,
                                meta["family"],
                                eprint_id,
                                meta["status"],
                                meta["full_text"],
                                files,
                                fnames,
                            ]
                        )
                        mixed = False
                        # Plain-text PUTs to the EPrints REST API
                        headers = {"content-type": "text/plain"}
                        for filen in files:
                            new = "public"
                            # Doc status
                            url = (
                                base_url
                                + "/rest/eprint/"
                                + str(eprint_id)
                                + "/full_text_status.txt"
                            )
                            response = requests.get(url)
                            eprint_status = response.text
                            if eprint_status == "restricted":
                                response = requests.put(url, data=new, headers=headers)
                                print(response)
                            elif eprint_status == "mixed":
                                print("mixed, skipping")
                                mixed = True
                            elif eprint_status != "public":
                                print(eprint_status)
                                print(url)
                                exit()
                            url = (
                                base_url
                                + "/rest/eprint/"
                                + str(eprint_id)
                                + "/documents/"
                                + str(filen)
                                + "/security.txt"
                            )
                            response = requests.get(url)
                            live_status = response.text
                            if not mixed:
                                if live_status == "validuser":
                                    response = requests.put(
                                        url, data=new, headers=headers
                                    )
                                    print(response)
                                elif live_status != "public":
                                    print(live_status)
                                    print(url)
                                    exit()
        os.remove(tarball)
    sys.stdout.flush()

if len(sys.argv) == 1:
    app = os.path.basename(sys.argv[0])
    print(f"USAGE: {app} DATASET_NAME", end="\n\n")
    print(
        "Converts attachments in a dataset from tarballs to v0.0.62 attachment scheme",
        end="\n\n")
    sys.exit(0)

if not os.path.exists("tmp-attachment-migration"):
    os.mkdir("tmp-attachment-migration")
os.chdir("tmp-attachment-migration")
print(f"Working directory for migration is {os.getcwd()}")
for c_name in sys.argv[1:]:
    keys = dataset.keys(os.path.join("..", c_name))
    if isinstance(keys[0], int):
        keys.sort(key=int)
    else:
        keys.sort()
    tot = len(keys)
    print(f"Ready to process {tot} objects")
    for i, key in enumerate(keys):
        if (i > 0) and (i % 500) == 0:
            print(f"\n{i} of {tot} processed")
        migrate_attachment(os.path.join("..", c_name), key)
    print()
    print(f"Processing {c_name} complete")
def aggregate_usage(usage_collection, month_collection):
    keys = dataset.keys(usage_collection)
    keys.remove("end-date")
    for k in progressbar(keys):
        record, err = dataset.read(usage_collection, k)
        if err != "":
            print(err)
        use = {}
        views = {}
        for usage in record["performance"]:
            split = usage["period"].split("-")
            month = split[0] + "-" + split[1]
            for u in usage["instance"]:
                metric = u["metric-type"]
                if metric == "unique-dataset-requests":
                    if month in use:
                        use[month] += u["count"]
                    else:
                        use[month] = u["count"]
                if metric == "unique-dataset-investigations":
                    if month in views:
                        views[month] += u["count"]
                    else:
                        views[month] = u["count"]
        # Strip non-counter stuff
        record.pop("_Key")
        record.pop("grand-total-unique-requests")
        record.pop("grand-total-unique-investigations")
        # go across months
        for view in views:
            split = view.split("-")
            date_obj = datetime(int(split[0]), int(split[1]), 1)
            d_range = get_month_day_range(date_obj)
            performance = [
                {
                    "period": {
                        "begin-date": d_range[0].date().isoformat(),
                        "end-date": d_range[1].date().isoformat(),
                    },
                    "instance": [],
                }
            ]
            v = views[view]
            performance[0]["instance"].append(
                {
                    "count": v,
                    "metric-type": "unique-dataset-investigations",
                    "access-method": "regular",
                }
            )
            # Handle when we have both views and uses in a given month
            if view in use:
                u = use[view]
                performance[0]["instance"].append(
                    {
                        "count": u,
                        "metric-type": "unique-dataset-requests",
                        "access-method": "regular",
                    }
                )
            existing, err = dataset.read(month_collection, view)
            if err != "":
                print(err)
            record["performance"] = performance
            existing["report-datasets"].append(record)
            if not dataset.update(month_collection, view, existing):
                err = dataset.error_message()
                print(err)
        for use_date in use:
            # We only have use-only records left to handle
            if use_date not in views:
                u = use[use_date]
                split = use_date.split("-")
                date_obj = datetime(int(split[0]), int(split[1]), 1)
                d_range = get_month_day_range(date_obj)
                performance = [
                    {
                        "period": {
                            "begin-date": d_range[0].date().isoformat(),
                            "end-date": d_range[1].date().isoformat(),
                        },
                        "instance": [
                            {
                                "count": u,
                                "metric-type": "unique-dataset-requests",
                                "access-method": "regular",
                            }
                        ],
                    }
                ]
                existing, err = dataset.read(month_collection, use_date)
                if err != "":
                    print(err)
                record["performance"] = performance
                existing["report-datasets"].append(record)
                if not dataset.update(month_collection, use_date, existing):
                    err = dataset.error_message()
                    print(err)
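
# A hedged sketch, not part of the original pipeline: aggregate_usage() expects
# month_collection to hold one record per "YYYY-MM" key, each with a
# "report-datasets" list that the function appends to. The collection name and
# the month key below are illustrative placeholders.
from py_dataset import dataset

month_collection = "caltechdata_monthly.ds"
if not dataset.status(month_collection):
    dataset.init(month_collection)
if not dataset.has_key(month_collection, "2020-04"):
    dataset.create(month_collection, "2020-04", {"report-datasets": []})
aggregate_usage("caltechdata_usage.ds", month_collection)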