def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage information"""
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage.
            # All dates are the date added to CaltechDATA, which is
            # the appropriate 'publication' date even if content was
            # available earlier.
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()

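# A small usage sketch (hypothetical; the collection names below are
# examples, not defined in this module). It assumes the CaltechDATA
# metadata has already been harvested locally, e.g. with get_caltechdata():
#
#   build_usage("caltechdata.ds", "caltechdata_usage.ds")
#
# Usage records are keyed by CaltechDATA record id, plus the special
# "end-date" record that marks where statistics collection starts.
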
def test_issue12(t, c_name):
    src = '''[
        {"id": "1", "c1": 1, "c2": 2, "c3": 3 },
        {"id": "2", "c1": 2, "c2": 2, "c3": 3 },
        {"id": "3", "c1": 3, "c2": 3, "c3": 3 },
        {"id": "4", "c1": 1, "c2": 1, "c3": 1 },
        {"id": "5", "c1": 6, "c2": 6, "c3": 6 }
    ]'''
    #dataset.verbose_on() # DEBUG
    #dataset.use_strict_dotpath(True) # DEBUG
    if dataset.status(c_name) == False:
        if not dataset.init(c_name):
            err = dataset.error_message()
            t.error(f'failed to create {c_name}, {err}')
            return
    objects = json.loads(src)
    for obj in objects:
        key = obj['id']
        if dataset.has_key(c_name, key):
            dataset.update(c_name, key, obj)
        else:
            dataset.create(c_name, key, obj)
    f_names = dataset.frames(c_name)
    for f_name in f_names:
        ok = dataset.delete_frame(c_name, f_name)
        if ok == False:
            err = dataset.error_message()
            t.error(f'Failed to delete {f_name} from {c_name} -> "{err}"')
            return
        if dataset.has_frame(c_name, f_name) == True:
            t.error(
                f'Failed to delete frame {f_name} from {c_name}, frame still exists'
            )
            return
    f_name = 'issue12'
    dot_paths = [".c1", "c3"]
    labels = [".col1", ".col3"]
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(f'failed to create {f_name} from {c_name}, {err}')
    if not dataset.has_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected frame {f_name} to exist, {err}')
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) == 0:
        err = dataset.error_message()
        t.error(f'expected keys in {f_name}, got zero, {err}')
        return
    f_objects = dataset.frame_objects(c_name, f_name)
    if len(f_objects) == 0:
        err = dataset.error_message()
        t.error(f'expected objects in {f_name}, got zero, {err}')
        return
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected to delete {f_name} in {c_name}, {err}')

def get_caltechdata(collection, production=True, datacite=False):
    """Harvest all records from CaltechDATA. Always creates collection from scratch"""
    # Delete existing collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()
    if production == True:
        url = "https://data.caltech.edu/api/records"
    else:
        url = "https://cd-sandbox.tind.io/api/records"
    response = requests.get(url + "/?size=9000")
    hits = response.json()
    print(hits)
    for h in progressbar(hits["hits"]["hits"]):
        rid = str(h["id"])
        # Get enriched metadata records (including files)
        if datacite == False:
            metadata = decustomize_schema(h["metadata"], True, True, True)
            metadata["updated"] = h["updated"]
        else:
            # Get just DataCite metadata
            metadata = decustomize_schema(h["metadata"])
        if not dataset.create(collection, rid, metadata):
            err = dataset.error_message()
            print(err)

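# Hypothetical invocation sketch (collection names are examples only):
# harvest the production repository with enriched metadata, or pass
# datacite=True to keep just the DataCite metadata for each record.
#
#   get_caltechdata("caltechdata.ds")
#   get_caltechdata("caltechdata_dc.ds", datacite=True)
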
def test_frame(t, c_name):
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(err)
        return
    data = [{
        "id": "A",
        "one": "one",
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "one": "ONE",
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = ["._Key", ".one", ".two", ".three", ".four"]
    labels = ["_Key", "one", "two", "three", "four"]
    for row in data:
        key = row['id']
        keys.append(key)
        dataset.create(c_name, key, row)
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(err)
    if dataset.frame_reframe(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(err)
    l = dataset.frames(c_name)
    if len(l) != 1 or l[0] != 'f1':
        t.error(f"expected one frame name, f1, got {l}")
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')

def write_records(records, collection):
    for r in records:
        key = r["UID"]
        if dataset.has_key(collection, key) == False:
            print(key)
            err = dataset.create(collection, key, r)
            if err != "":
                print(f"Unexpected error on create: {err}")

def test_issue32(t, collection_name):
    if dataset.create(collection_name, "k1", {"one": 1}) == False:
        err = dataset.error_message()
        t.error("Failed to create k1 in", collection_name, ', ', err)
        return
    if dataset.has_key(collection_name, "k1") == False:
        t.error("Failed, has_key k1 should return", True)
    if dataset.has_key(collection_name, "k2") == True:
        t.error("Failed, has_key k2 should return", False)

def test_check_repair(t, collection_name):
    t.print("Testing status on", collection_name)
    # Make sure we have a left over collection to check and repair
    if os.path.exists(collection_name) == True:
        shutil.rmtree(collection_name)
    if dataset.status(collection_name) == True:
        dataset.close(collection_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'init({collection_name}) failed, {err}')
        return
    if dataset.status(collection_name) == False:
        t.error(
            f"Failed, expected dataset.status() == True, got False for {collection_name}"
        )
        return
    if dataset.has_key(collection_name, 'one') == False:
        if dataset.create(collection_name, 'one', {"one": 1}) == False:
            err = dataset.error_message()
            t.error(
                f'create({collection_name}, "one", {{"one": 1}}) failed, {err}')
    t.print(f"Testing check on {collection_name}")
    # Check our collection
    if not (dataset.check(collection_name) == True):
        err = dataset.error_message()
        t.error(
            f"Failed, (before break) expected check True, got False for {collection_name} (err: {err})"
        )
        return
    # Break and recheck our collection
    print(f"Removing {collection_name}/collection.json to cause a fail")
    if os.path.exists(collection_name + "/collection.json"):
        os.remove(collection_name + "/collection.json")
    print(f"Testing check on (broken) {collection_name}")
    if not (dataset.check(collection_name) == False):
        err = dataset.error_message()
        t.error(
            f"Failed, (after break) expected check False got True for {collection_name} (err: {err})"
        )
    else:
        t.print(f"Should have seen error output for broken {collection_name}")
    # Repair our collection
    t.print("Testing repair on", collection_name)
    if dataset.repair(collection_name) == False:
        err = dataset.error_message()
        t.error("Failed, expected repair to return True, got, ", err)
    if os.path.exists(os.path.join(collection_name, "collection.json")) == False:
        t.error(
            f"Failed, expected recreated {collection_name}/collection.json")

def make_link_history(collection, resolver, url, note):
    """Make an entry in our link history collection"""
    now = datetime.today().isoformat()
    # Run checks on both resolver and final URL
    try:
        target = requests.get(url)
    except requests.exceptions.ConnectionError:
        target = requests.Response()
        target.status_code = 404
        target.url = ''
    if target.status_code != 200:
        print(
            f"Target URL {url} returns Error status code {target.status_code}")
    if links_differ(target.url, url):
        print(f"Target URL '{url}' redirects to '{target.url}'")
    try:
        get = requests.get(f"https://resolver.library.caltech.edu/{resolver}")
    except requests.exceptions.ConnectionError:
        get = requests.Response()
        get.status_code = 404
        get.url = ''
    if links_differ(get.url, url):
        print(f"Mismatch between expected url '{url}' and actual '{get.url}'")
    if get.status_code != 200:
        print(
            f"Resolver URL ({resolver}) '{get.url}' returns Error status code {get.status_code}"
        )
    entry = {
        "expected-url": url,
        "url": get.url,
        "modified": now,
        "code": get.status_code,
        "note": note,
    }
    # If existing, push into history
    if dataset.has_key(collection, resolver):
        existing, err = dataset.read(collection, resolver)
        if err != "":
            print(err)
            exit()
        if save_history(existing, url, get):
            past_history = existing.pop("history")
            past_history.append(existing)
            entry["history"] = past_history
            if not dataset.update(collection, resolver, entry):
                print(dataset.error_message())
                exit()
    else:
        entry["history"] = []
        if not dataset.create(collection, resolver, entry):
            print(dataset.error_message())
            exit()

def get_history(collection, caltechdata_collection, caltechdata_keys):
    """Harvest the history of records from CaltechDATA."""
    keys_to_update = []
    if os.path.exists("historyupdate"):
        with open("historyupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(caltechdata_keys, redirect_stdout=True):
        existing, err = dataset.read(caltechdata_collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        record_update = datetime.fromisoformat(existing["updated"]).date()
        if record_update > update:
            keys_to_update.append(k)
    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    base_url = "https://data.caltech.edu/records/"
    for k in progressbar(keys_to_update):
        url = base_url + str(k) + "/revisions"
        response = requests.get(url)
        revisions = response.json()
        for num, metadata in enumerate(revisions):
            key = f"{k}-{num}"
            if dataset.has_key(collection, key) == False:
                dataset.create(collection, key, metadata)
    # Save date in file
    today = date.today().isoformat()
    with open("historyupdate", "w") as outfile:
        outfile.write(today)

def test_join(t, collection_name):
    key = "test_join1"
    obj1 = {"one": 1}
    obj2 = {"two": 2}
    if dataset.status(collection_name) == False:
        t.error("Failed, collection status is False,", collection_name)
        return
    ok = dataset.has_key(collection_name, key)
    err = ''
    if ok == True:
        ok = dataset.update(collection_name, key, obj1)
    else:
        ok = dataset.create(collection_name, key, obj1)
    if ok == False:
        err = dataset.error_message()
        t.error(
            f'Failed, could not add record for test ({collection_name}, {key}, {obj1}), {err}'
        )
        return
    if dataset.join(collection_name, key, obj2, overwrite=False) == False:
        err = dataset.error_message()
        t.error(
            f'Failed, join for {collection_name}, {key}, {obj2}, overwrite = False -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    if obj_result.get('one') != 1:
        t.error(f'Failed to join append key {key}, {obj_result}')
    if obj_result.get("two") != 2:
        t.error(f'Failed to join append key {key}, {obj_result}')
    obj2['one'] = 3
    obj2['two'] = 3
    obj2['three'] = 3
    if dataset.join(collection_name, key, obj2, overwrite=True) == False:
        err = dataset.error_message()
        t.error(
            f'Failed to join overwrite {collection_name}, {key}, {obj2}, overwrite = True -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    for k in obj_result:
        if k != '_Key' and obj_result[k] != 3:
            t.error('Failed to update value in join overwrite', k, obj_result)

def build_aggregate(collection):
    """Build a collection for usage by month. Always creates collection from scratch"""
    # Delete existing collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()
    # Find time periods
    start = datetime.fromisoformat("2017-01-01")
    today = datetime.today().date().isoformat()
    date_list = pd.date_range(start, today, freq="MS").strftime("%Y-%m").to_list()
    for month in date_list:
        if not dataset.create(collection, month, {"report-datasets": []}):
            err = dataset.error_message()
            print(err)

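# Sketch of the keys this produces (assuming a hypothetical run date of
# 2017-04-15): pd.date_range(..., freq="MS") yields month-start timestamps,
# so the collection gets one key per month,
#
#   ['2017-01', '2017-02', '2017-03', '2017-04']
#
# each initialized with an empty "report-datasets" list for the usage
# aggregation step to fill in.
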
def get_wos_refs(new=True):
    # New=True will download everything from scratch and delete any existing records
    collection = "all_wos.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()
    # Get access token from WOS saved as environment variable with source token.bash
    token = os.environ["WOSTOK"]
    headers = {"X-ApiKey": token, "Content-type": "application/json"}
    # Run query to get scope of records
    base_url = "https://api.clarivate.com/api/wos/?databaseId=WOK"
    collected = dataset.has_key(collection, "captured")
    if collected == True:
        date = dataset.read(collection, "captured")
        date = date[0]["captured"]
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = current - date
        base_url = base_url + "&loadTimeSpan=" + str(diff.days) + "D"
    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        err = dataset.update(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on create: {err}")
    query = "OG=(California Institute of Technology)"
    query = urllib.parse.quote_plus(query)
    url = base_url + "&usrQuery=" + query + "&count=100&firstRecord=1"
    response = requests.get(url, headers=headers)
    response = response.json()
    record_count = response["QueryResult"]["RecordsFound"]
    print(record_count, " Records from WOS")
    query_id = response["QueryResult"]["QueryID"]
    try:
        records = response["Data"]["Records"]["records"]["REC"]
    except:
        print(response)
    write_records(records, collection)
    # We have saved the first 100 records
    record_start = 101
    record_count = record_count - 100
    query_url = "https://api.clarivate.com/api/wos/query/"
    while record_count > 0:
        print(record_count)
        print(len(records), "records")
        if record_count > 100:
            url = (
                query_url + str(query_id) + "?count=100&firstRecord=" +
                str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            try:
                records = response["Records"]["records"]["REC"]
            except:
                print(response)
            write_records(records, collection)
            record_start = record_start + 100
            record_count = record_count - 100
        else:
            url = (
                query_url + str(query_id) + "?count=" + str(record_count) +
                "&firstRecord=" + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            records = response["Records"]["records"]["REC"]
            write_records(records, collection)
            record_count = 0
    print("Downloaded all records ")

def test_sync_csv(t, c_name):
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return
    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    csv_name = c_name.strip(".ds") + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)
    # Import CSV into collection
    if dataset.import_csv(c_name, csv_name, True) == False:
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if dataset.has_key(c_name, key) == False:
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five") == True:
        t.error("expected has_key('five') == False, got True")
    if dataset.create(c_name, "five", {"value": 5}) == False:
        err = dataset.error_message()
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return
    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if dataset.frame_create(c_name, frame_name, keys, ["._Key", ".value"],
                            ["_Key", "value"]) == False:
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return
    # NOTE: Tests for sync_send_csv and sync_receive_csv
    if dataset.sync_send_csv(c_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
    if 'five' not in src:
        t.error(f"expected 'five' in src, got {src}")
    # Now remove "five" from collection
    if dataset.delete(c_name, "five") == False:
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five") == True:
        t.error("expected has_key(five) == False, got True")
        return
    if dataset.sync_recieve_csv(c_name, frame_name, csv_name, False) == False:
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if dataset.has_key(c_name, "five") == False:
        t.error("expected has_key(five) == True, got False")
        return

def get_crossref_refs(prefix, done=False, new=True):
    # New=True will download everything from scratch and delete any existing records
    collection = "crossref_refs.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix)
    collected = dataset.has_key(collection, "captured")
    cursor = ""
    count = 0
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettiness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)
        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same cursor back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None
    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]
        # Check Deleted
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on delete: {err}")
            cursor = records["message"]["next-cursor"]
        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]
    if done:
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")

log.print(f"Ingesting {f_name}") metadata = frontmatter(f_name) with open(f_name) as f: src = f.read() if "id" in metadata: key = str(metadata["id"]) if dataset.has_key(c_name, key): err = dataset.update(c_name, key, { "metadata": metadata, "content": f_name, "src": src }) else: err = dataset.create(c_name, key, { "metadata": metadata, "content": f_name, "src": src }) if err != "": log.fatal(err) else: log.print(f"Warning, no front matter for {f_name}") # for each dataset record render appropriate HTML pages keys = dataset.keys(c_name) for key in keys: page, err = dataset.read(c_name, key) if err != "": log.print(f"WARNING: could not read {key} from {c_name}, skipping") if 'output' in page['metadata']: p = page['metadata']['output']
def test_keys(t, collection_name):
    '''test_keys(collection_name) test getting, filter and sorting keys'''
    # Test count after delete
    key_list = dataset.keys(collection_name)
    cnt = dataset.count(collection_name)
    if cnt != 0:
        t.error("Failed, expected zero records, got", cnt, key_list)
    #
    # Generate multiple records for collection for testing keys
    #
    test_records = {
        "gutenberg:21489": {
            "title": "The Secret of the Island",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "http://www.gutenberg.org/ebooks/21489",
            "categories": "fiction, novel"
        },
        "gutenberg:2488": {
            "title": "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "https://www.gutenberg.org/ebooks/2488",
            "categories": "fiction, novel"
        },
        "gutenberg:21839": {
            "title": "Sense and Sensibility",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jane",
                "family": "Austin"
            }],
            "url": "http://www.gutenberg.org/ebooks/21839",
            "categories": "fiction, novel"
        },
        "gutenberg:3186": {
            "title": "The Mysterious Stranger, and Other Stories",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Mark",
                "family": "Twain"
            }],
            "url": "http://www.gutenberg.org/ebooks/3186",
            "categories": "fiction, short story"
        },
        "hathi:uc1321060001561131": {
            "title": "A year of American travel - Narrative of personal experience",
            "formats": ["pdf"],
            "authors": [{
                "given": "Jessie Benton",
                "family": "Fremont"
            }],
            "url": "https://babel.hathitrust.org/cgi/pt?id=uc1.32106000561131;view=1up;seq=9",
            "categories": "non-fiction, memoir"
        }
    }
    test_count = len(test_records)
    for k in test_records:
        v = test_records[k]
        if dataset.create(collection_name, k, v) == False:
            err = dataset.error_message()
            t.error("Failed, could not add", k, "to", collection_name, ', ', err)
    # Test keys, filtering keys and sorting keys
    all_keys = dataset.keys(collection_name)
    if len(all_keys) != test_count:
        t.error("Expected", test_count, "all_keys back, got", all_keys)
    #dataset.verbose_on()
    filter_expr = '(eq .categories "non-fiction, memoir")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 1:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected one key for",
            filter_expr, "got", filtered_keys)
    filter_expr = '(contains .categories "novel")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 3:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected three keys for",
            filter_expr, "got", filtered_keys)
    sort_expr = '+.title'
    filter_expr = '(contains .categories "novel")'
    sorted_keys = dataset.key_sort(collection_name, filtered_keys, sort_expr)
    if len(sorted_keys) != 3:
        t.error(
            f"key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected three keys for",
            filter_expr, "got", sorted_keys)
    expected_keys = ["gutenberg:21839", "gutenberg:21489", "gutenberg:2488"]
    for i, k in enumerate(expected_keys):
        if i < len(sorted_keys) and sorted_keys[i] != k:
            obj1, _ = dataset.read(collection_name, k)
            obj2, _ = dataset.read(collection_name, sorted_keys[i])
            t.error(
                f'key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected {k} (title "{obj1["title"]}") got {sorted_keys[i]} (title "{obj2["title"]}")'
            )

def test_basic(t, collection_name):
    '''test_basic(collection_name) runs tests on basic CRUD ops'''
    # Setup a test record
    key = "2488"
    value = {
        "title": "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
        "formats": ["epub", "kindle", "plain text"],
        "authors": [{
            "given": "Jules",
            "family": "Verne"
        }],
        "url": "https://www.gutenberg.org/ebooks/2488"
    }
    # We should have an empty collection, we will create our test record.
    if dataset.create(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f'create({collection_name}, {key}, {value}) failed, {err}')
        return
    # Check to see that we have only one record
    key_count = dataset.count(collection_name)
    if key_count != 1:
        t.error(f"Failed, expected count to be 1, got {key_count}")
    # Do a minimal test to see if the record looks like it has content
    keyList = dataset.keys(collection_name)
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"expected {rec[k]} got {v}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with list v, got {v}")
    # Test updating record
    value["verified"] = True
    if dataset.update(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f"update({collection_name}, {key}, {value}) failed, {err}")
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"expected {rec[k]} got {v} for key {k}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with a list for v, got {v}")
    # Test path to record
    expected_s = "/".join(
        [collection_name, "pairtree", "24", "88", (key + ".json")])
    expected_l = len(expected_s)
    p = dataset.path(collection_name, key)
    if len(p) != expected_l:
        t.error("Failed, expected length", expected_l, "got", len(p))
    if p != expected_s:
        t.error("Failed, expected", expected_s, "got", p)
    # Test listing records
    l = dataset.list(collection_name, [key])
    if len(l) != 1:
        t.error(
            f"list({collection_name}, [{key}]) failed, list should return an array of one record, got",
            l)
        return
    # test deleting a record
    if dataset.delete(collection_name, key) == False:
        err = dataset.error_message()
        t.error("Failed, could not delete record", key, ", ", err)

                dupe.affiliations)
            if subject.years not in dupe.years:
                dupe.years += subject.years
            if subject.names not in dupe.names:
                dupe.names += subject.names
            if subject.links not in dupe.links:
                dupe.links += subject.links

print("Total collaborators: ", len(deduped))

collab = 'collaborators.ds'
subprocess.run(['rm', '-rf', collab])
dataset.init(collab)
for d in deduped:
    dataset.create(collab, d.ca_id, d.write())

# Export to Google Sheet
os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"
# Google sheet ID for output
f_name = 'frm'
sheet_name = "Sheet1"
sheet_range = "A1:CZ"
export_list = [".names", ".years", ".affiliations", ".links"]
title_list = ["name", "years", "affiliations", "links"]
keys = dataset.keys(collab)
if dataset.has_frame(collab, f_name):
    dataset.delete_frame(collab, f_name)
frame, err = dataset.frame(collab, f_name, keys, export_list, title_list)
if err != '':
    print(err)

def test_issue43(t, collection_name, csv_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if os.path.exists(csv_name):
        os.remove(csv_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'Failed, need a {collection_name} to run test, {err}')
        return
    table = {
        "r1": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        },
        "r2": {
            "c1": "one",
            "c3": "three",
            "c4": "four"
        },
        "r3": {
            "c1": "one",
            "c2": "two",
            "c4": "four"
        },
        "r4": {
            "c1": "one",
            "c2": "two",
            "c3": "three"
        },
        "r5": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        }
    }
    for key in table:
        row = table[key]
        if dataset.create(collection_name, key, row) == False:
            err = dataset.error_message()
            t.error(f"Can't add test row {key} to {collection_name}, {err}")
            return
    dataset.use_strict_dotpath(False)
    # Setup frame
    frame_name = 'f1'
    keys = dataset.keys(collection_name)
    if dataset.frame_create(collection_name, frame_name, keys,
                            ["._Key", ".c1", ".c2", ".c3", ".c4"],
                            ["_Key", "c1", "c2", "c3", "c4"]) == False:
        err = dataset.error_message()
        t.error(err)
        return
    if dataset.export_csv(collection_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'export_csv({collection_name}, {frame_name}, {csv_name} should have emitted warnings, not error, {err}'
        )
        return
    with open(csv_name, mode='r', encoding='utf-8') as f:
        rows = f.read()
    for row in rows.split('\n'):
        if len(row) > 0:
            cells = row.split(',')
            if len(cells) < 5:
                t.error(f'row error {csv_name} for {cells}')

            # We're just going to use the first other identifier as filler
            if link == '':
                link = idv['value']
    else:
        # Just one identifier
        if identifiers['type'] == 'xref_doi':
            link = 'https://doi.org/' + idv['value']
        elif identifiers['type'] == 'doi':
            link = 'https://doi.org/' + idv['value']
        else:
            link = idv['value']
    record = {
        'id': uid,
        'title': title,
        'journal': journal,
        'authors': author_list,
        'identifiers': identifier_list,
        'affiliations': affiliation_list,
        'link': link,
        'year': publication_date.year
    }
    dataset.create(collection, link, record)

# Export to Google Sheet
os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"
# Google sheet ID for output
sheet_name = "Sheet1"
sheet_range = "A1:CZ"
f_name = 'f_name'
export_list = [".link", ".title", ".journal", ".year"]
title_list = ["link", "title", "journal", "year"]
keys = dataset.keys(collection)
if dataset.has_frame(collection, f_name):
    dataset.delete_frame(collection, f_name)
frame, err = dataset.frame(collection, f_name, keys, export_list, title_list)
if err != '':

def test_frame_objects(t, c_name):
    if dataset.status(c_name) == True:
        dataset.close(c_name)
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}), {err}')
        return
    data = [{
        "id": "A",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }, {
            "nameIdentifier": "H-XXXX-XXXX",
            "nameIdentifierScheme": "ResearcherID",
            "schemeURI": "http://www.researcherid.com/rid/"
        }],
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }],
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = [
        "._Key", ".nameIdentifiers", ".nameIdentifiers[:].nameIdentifier",
        ".two", ".three", ".four"
    ]
    labels = [
        "id", "nameIdentifiers", "nameIdentifier", "two", "three", "four"
    ]
    for row in data:
        key = row['id']
        keys.append(key)
        err = dataset.create(c_name, key, row)
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(
            f'frame_create({c_name}, {f_name}, {keys}, {dot_paths}, {labels}), {err}'
        )
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) != len(keys):
        t.error(f'expected {len(keys)}, got {len(f_keys)}')
    if dataset.frame_refresh(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'frame_refresh({c_name}, {f_name}), {err}')
    l = dataset.frames(c_name)
    if len(l) != 1 or l[0] != 'f1':
        t.error(f"expected one frame name, f1, got {l}")
    object_result = dataset.frame_objects(c_name, f_name)
    if len(object_result) != 4:
        t.error(
            f'Did not get correct number of objects back, expected 4 got {len(object_result)}, {object_result}'
        )
    count_nameId = 0
    count_nameIdObj = 0
    for obj in object_result:
        if 'id' not in obj:
            t.error('Did not get id in object')
        if 'nameIdentifiers' in obj:
            count_nameId += 1
            for idv in obj['nameIdentifiers']:
                if 'nameIdentifier' not in idv:
                    t.error('Missing part of object')
        if 'nameIdentifier' in obj:
            count_nameIdObj += 1
            if "0000-000X-XXXX-XXXX" not in obj['nameIdentifier']:
                t.error('Missing object in complex dot path')
    if count_nameId != 2:
        t.error(
            f"Incorrect number of nameIdentifiers elements, expected 2, got {count_nameId}"
        )
    if count_nameIdObj != 2:
        t.error(
            f"Incorrect number of nameIdentifier elements, expected 2, got {count_nameIdObj}"
        )
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')

def read_records(data, current, collection):
    # read records in 'hits' structure
    for record in data:
        rid = str(record["id"])
        metadata = record["metadata"]
        download = False  # Flag for downloading files
        # Do we need to download?
        if "electronic_location_and_access" in metadata:
            # Get information about already backed up files:
            existing_size = []
            existing_names = []
            if rid in current:
                # Get existing files
                attachments = dataset.attachments(collection, rid)
                for a in attachments:
                    split = a.split(" ")
                    # Handle file names with spaces; size will always be last
                    size = split[-1]
                    name = a.replace(f' {size}', '')
                    existing_names.append(name)
                    existing_size.append(size)
            # Look at all files
            count = len(metadata["electronic_location_and_access"])
            dl = 0
            for erecord in metadata["electronic_location_and_access"]:
                # Check if file has been downloaded
                size = erecord["file_size"]
                name = erecord["electronic_name"][0]
                if size in existing_size and name in existing_names:
                    dl = dl + 1
            if dl == count:
                print(
                    "files already downloaded ",
                    existing_size,
                    existing_names,
                )
                download = False
            else:
                print("file mismatch ", existing_size, existing_names, dl, count)
                download = True
        # Save results in dataset
        print("Saving record " + rid)
        if rid in current:
            print('Update')
            update = dataset.update(collection, rid, record)
            if update == False:
                print(
                    f"Failed, could not update record: {dataset.error_message()}"
                )
                exit()
        else:
            create = dataset.create(collection, rid, record)
            print('CREATED', create, rid)
            if create == False:
                print(
                    f"Failed, could not create record: {dataset.error_message()}"
                )
                exit()
        if download == True:
            files = []
            print("Downloading files for ", rid)
            for erecord in metadata["electronic_location_and_access"]:
                f = download_file(erecord, rid)
                if f != None:
                    files.append(f)
            print(files)
            print("Attaching files")
            if len(files) != 0:
                err = dataset.attach(collection, rid, files)
                if err == False:
                    print(f"Failed on attach {dataset.error_message()}")
                    exit()
            for f in files:
                if f != None:
                    os.remove(f)

def get_cd_github(new=True):
    collection = "github_records.ds"
    if new == True:
        os.system("rm -rf " + collection)
    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    url = "https://data.caltech.edu/api/records"
    response = requests.get(url + "/?size=1000&q=subjects:GitHub")
    hits = response.json()
    for h in hits["hits"]["hits"]:
        rid = str(h["id"])
        record = h["metadata"]
        result = dataset.has_key(collection, rid)
        if result == False:
            dataset.create(collection, rid, record)
            print("Downloading files for ", rid)
            codemeta = False
            for erecord in record["electronic_location_and_access"]:
                f = download_file(erecord, rid)
                # We're just looking for the zip file
                if f.split(".")[-1] == "zip":
                    zip_files = subprocess.check_output(
                        ["unzip", "-l", f.rstrip()],
                        universal_newlines=True).splitlines()
                    i = 4  # Ignore header
                    line = zip_files[i]
                    while line[0] != "-":
                        split = line.split("/")
                        fname = split[1]
                        if fname == "codemeta.json":
                            sp = line.split(" ")[-1]
                            os.system("unzip -j " + f.rstrip() + " " + sp + " -d .")
                            codemeta = True
                        i = i + 1
                        line = zip_files[i]
                    # Will only identify codemeta files in root of repo
                # Trash downloaded files - extracted codemeta.json not impacted
                print("Trash " + f)
                os.system("rm " + f)
            if codemeta == True:
                print(collection, rid)
                response = dataset.attach(collection, rid, ["codemeta.json"])
                print("Attachment ", response)
                os.system("rm codemeta.json")
                print("Trash codemeta.json")