def test_check_repair(t, collection_name):
    """Exercise dataset.check() and dataset.repair().

    Creates a fresh collection, verifies check() passes, breaks the
    collection by deleting collection.json, verifies check() then fails,
    and finally verifies repair() recreates collection.json.
    """
    t.print("Testing status on", collection_name)
    # Make sure we have a left over collection to check and repair
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if dataset.status(collection_name) == True:
        dataset.close(collection_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'init({collection_name}) failed, {err}')
        return
    if dataset.status(collection_name) == False:
        t.error(
            f"Failed, expected dataset.status() == True, got False for {collection_name}"
        )
        return
    if dataset.has_key(collection_name, 'one') == False:
        if dataset.create(collection_name, 'one', {"one": 1}) == False:
            err = dataset.error_message()
            # BUG FIX: braces must be doubled inside the f-string so the
            # literal dict appears in the message (single braces were parsed
            # as a format expression and printed just "one").
            t.error(f'create({collection_name}, "one", {{"one": 1}}) failed, {err}')
    t.print(f"Testing check on {collection_name}")
    # Check our collection
    if not (dataset.check(collection_name) == True):
        err = dataset.error_message()
        # BUG FIX: the f prefix was missing, so the placeholders printed verbatim.
        t.error(
            f"Failed, (before break) expected check True, got False for {collection_name} (err: {err})"
        )
        return
    # Break and recheck our collection
    print(f"Removing {collection_name}/collection.json to cause a fail")
    if os.path.exists(collection_name + "/collection.json"):
        os.remove(collection_name + "/collection.json")
    print(f"Testing check on (broken) {collection_name}")
    if not (dataset.check(collection_name) == False):
        err = dataset.error_message()
        t.error(
            f"Failed, (after break) expected check False got True for {collection_name} (err: {err})"
        )
    else:
        t.print(f"Should have seen error output for broken {collection_name}")
    # Repair our collection
    t.print("Testing repair on", collection_name)
    if dataset.repair(collection_name) == False:
        err = dataset.error_message()
        t.error("Failed, expected repair to return True, got, ", err)
    if os.path.exists(os.path.join(collection_name, "collection.json")) == False:
        t.error(f"Failed, expected recreated {collection_name}/collection.json")
def make_link_history(collection, resolver, url, note):
    """Make an entry in our link history collection"""
    now = datetime.today().isoformat()
    # Run checks on both resolver and final URL
    try:
        target = requests.get(url)
    except requests.exceptions.ConnectionError:
        # Fabricate a 404 response so the checks below can still run
        target = requests.Response()
        target.status_code = 404
        target.url = ''
    if target.status_code != 200:
        print(
            f"Target URL {url} returns Error status code {target.status_code}")
    if links_differ(target.url, url):
        # Target URL redirected somewhere else than expected
        print(f"Target URL '{url}' redirects to '{target.url}'")
    # Now check what the library resolver actually resolves to
    try:
        get = requests.get(f"https://resolver.library.caltech.edu/{resolver}")
    except requests.exceptions.ConnectionError:
        get = requests.Response()
        get.status_code = 404
        get.url = ''
    if links_differ(get.url, url):
        print(f"Mismatch between expected url '{url}' and actual '{get.url}'")
    if get.status_code != 200:
        print(
            f"Resolver URL ({resolver}) '{get.url}' returns Error status code {get.status_code}"
        )
    # Record what the resolver returned this time around
    entry = {
        "expected-url": url,
        "url": get.url,
        "modified": now,
        "code": get.status_code,
        "note": note,
    }
    # If existing, push into history
    if dataset.has_key(collection, resolver):
        existing, err = dataset.read(collection, resolver)
        if err != "":
            print(err)
            exit()
        # save_history() presumably decides whether the state changed enough
        # to archive the previous entry -- TODO confirm its contract
        if save_history(existing, url, get):
            past_history = existing.pop("history")
            past_history.append(existing)
            entry["history"] = past_history
            if not dataset.update(collection, resolver, entry):
                print(dataset.error_message())
                exit()
    else:
        # First sighting of this resolver: start with an empty history
        entry["history"] = []
        if not dataset.create(collection, resolver, entry):
            print(dataset.error_message())
            exit()
def test_setup(t, collection_name):
    """Remove any leftover copy of collection_name and initialize it fresh."""
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        # BUG FIX: the f prefix was missing, so the placeholders printed verbatim.
        t.error(f"init({collection_name}) failed, {err}")
        return
def get_caltechdata(collection, production=True, datacite=False):
    """Harvest all records from CaltechDATA . Always creates collection from scratch"""
    # Start from a clean slate: drop any existing copy of the collection.
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()
    # Choose the production or sandbox API endpoint.
    url = ("https://data.caltech.edu/api/records" if production == True
           else "https://cd-sandbox.tind.io/api/records")
    hits = requests.get(url + "/?size=9000").json()
    print(hits)
    for hit in progressbar(hits["hits"]["hits"]):
        rid = str(hit["id"])
        if datacite == False:
            # Enriched metadata records (including files)
            metadata = decustomize_schema(hit["metadata"], True, True, True)
            metadata["updated"] = hit["updated"]
        else:
            # Just the DataCite metadata
            metadata = decustomize_schema(hit["metadata"])
        if not dataset.create(collection, rid, metadata):
            print(dataset.error_message())
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage information"""
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        # NOTE(review): 1485907200 is a unix timestamp (2017-02-01 UTC) -- confirm
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        # Only add skeleton records for keys we haven't seen yet
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                # Prefer Submitted, then Updated, then Issued
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage
            # All dates are the date added to CaltechDATA, which is
            # the apropriate 'publication' date even if content was available
            # earlier
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
def test_frame(t, c_name):
    """Exercise frame_create, frame_reframe, frames and delete_frame."""
    # Begin with a brand-new collection.
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        t.error(dataset.error_message())
        return
    records = [
        {"id": "A", "one": "one", "two": 22, "three": 3.0,
         "four": ["one", "two", "three"]},
        {"id": "B", "two": 2000, "three": 3000.1},
        {"id": "C"},
        {"id": "D", "one": "ONE", "two": 20, "three": 334.1, "four": []},
    ]
    dot_paths = ["._Key", ".one", ".two", ".three", ".four"]
    labels = ["_Key", "one", "two", "three", "four"]
    record_keys = []
    for rec in records:
        record_keys.append(rec["id"])
        dataset.create(c_name, rec["id"], rec)
    f_name = 'f1'
    # Build the frame and immediately reframe it.
    if not dataset.frame_create(c_name, f_name, record_keys, dot_paths, labels):
        t.error(dataset.error_message())
    if not dataset.frame_reframe(c_name, f_name):
        t.error(dataset.error_message())
    frame_names = dataset.frames(c_name)
    if len(frame_names) != 1 or frame_names[0] != 'f1':
        t.error(f"expected one frame name, f1, got {frame_names}")
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
def test_join(t, collection_name):
    """Exercise dataset.join() in both append (overwrite=False) and
    overwrite (overwrite=True) modes."""
    key = "test_join1"
    obj1 = {"one": 1}
    obj2 = {"two": 2}
    if dataset.status(collection_name) == False:
        t.error("Failed, collection status is False,", collection_name)
        return
    ok = dataset.has_key(collection_name, key)
    err = ''
    if ok == True:
        # BUG FIX: was "collection_nane" (NameError when the key exists).
        ok = dataset.update(collection_name, key, obj1)
    else:
        ok = dataset.create(collection_name, key, obj1)
    if ok == False:
        err = dataset.error_message()
        t.error(
            f'Failed, could not add record for test ({collection_name}, {key}, {obj1}), {err}'
        )
        return
    # Append-style join: existing values win, new keys are added.
    if dataset.join(collection_name, key, obj2, overwrite=False) == False:
        err = dataset.error_message()
        t.error(
            f'Failed, join for {collection_name}, {key}, {obj2}, overwrite = False -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    if obj_result.get('one') != 1:
        t.error(f'Failed to join append key {key}, {obj_result}')
    if obj_result.get("two") != 2:
        t.error(f'Failed to join append key {key}, {obj_result}')
    # Overwrite-style join: all values should become 3.
    obj2['one'] = 3
    obj2['two'] = 3
    obj2['three'] = 3
    if dataset.join(collection_name, key, obj2, overwrite=True) == False:
        err = dataset.error_message()
        t.error(
            f'Failed to join overwrite {collection_name}, {key}, {obj2}, overwrite = True -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    for k in obj_result:
        if k != '_Key' and obj_result[k] != 3:
            t.error('Failed to update value in join overwrite', k, obj_result)
def test_issue32(t, collection_name):
    """Verify has_key() answers correctly after a create (issue #32)."""
    if not dataset.create(collection_name, "k1", {"one": 1}):
        err = dataset.error_message()
        t.error("Failed to create k1 in", collection_name, ', ', err)
        return
    # Present key must be found, absent key must not.
    if not dataset.has_key(collection_name, "k1"):
        t.error("Failed, has_key k1 should return", True)
    if dataset.has_key(collection_name, "k2"):
        t.error("Failed, has_key k2 should return", False)
def test_issue12(t, c_name): src = '''[ {"id": "1", "c1": 1, "c2": 2, "c3": 3 }, {"id": "2", "c1": 2, "c2": 2, "c3": 3 }, {"id": "3", "c1": 3, "c2": 3, "c3": 3 }, {"id": "4", "c1": 1, "c2": 1, "c3": 1 }, {"id": "5", "c1": 6, "c2": 6, "c3": 6 } ]''' #dataset.verbose_on() # DEBUG #dataset.use_strict_dotpath(True) # DEBUG if dataset.status(c_name) == False: if not dataset.init(c_name): err = dataset.error_message() t.error(f'failed to create {c_name}') return objects = json.loads(src) for obj in objects: key = obj['id'] if dataset.has_key(c_name, key): dataset.update(c_name, key, obj) else: dataset.create(c_name, key, obj) f_names = dataset.frames(c_name) for f_name in f_names: ok = dataset.delete_frame(c_name, f_name) if ok == False: err = dataset.error_message() t.error(f'Failed to delete {f_name} from {c_name} -> "{err}"') return if dataset.has_frame(c_name, f_name) == True: t.error( f'Failed to delete frame {c_name} from {c_name}, frame still exists' ) return f_name = 'issue12' dot_paths = [".c1", "c3"] labels = [".col1", ".col3"] keys = dataset.keys(c_name) if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels): err = dataset.error_message() t.error(f'failed to create {f_name} from {c_name}, {err}') if not dataset.has_frame(c_name, f_name): err = dataset.error_message() t.error(f'expected frame {f_name} to exists, {err}') return f_keys = dataset.frame_keys(c_name, f_name) if len(f_keys) == 0: err = dataset.error_message() t.error(f'expected keys in {f_name}, got zero, {err}') return f_objects = dataset.frame_objects(c_name, f_name) if len(f_objects) == 0: err = dataset.error_message() t.error(f'expected objects in {f_name}, got zero, {err}') return if not dataset.delete_frame(c_name, f_name): err = dataset.error_message() t.error(f'expected to delete {f_name} in {c_name}, {err}')
def get_records(dot_paths, f_name, d_name, keys, labels=None, clear=True):
    """Return the frame objects for f_name in collection d_name.

    If the frame exists it is either deleted and rebuilt (clear=True) or
    refreshed and returned as-is (clear=False). When labels is falsy the
    labels are derived from the last component of each dot path.
    """
    if dataset.has_frame(d_name, f_name):
        if clear:
            dataset.delete_frame(d_name, f_name)
        else:
            dataset.frame_refresh(d_name, f_name)
            return dataset.frame_objects(d_name, f_name)
    if not labels:
        # If labels aren't provided, just base on dot path
        labels = [d.split(".")[-1] for d in dot_paths]
    # IMPROVEMENT: the frame_create call was duplicated in both branches;
    # deriving labels first lets us create the frame once.
    if not dataset.frame_create(d_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        print(f"ERROR: Can't create {f_name} in {d_name}, {err}")
    return dataset.frame_objects(d_name, f_name)
def test_clone_sample(t, c_name, sample_size, training_name, test_name):
    """Clone c_name into a training/test pair and report any error."""
    # Remove leftovers from earlier runs so clone_sample starts clean.
    for leftover in (training_name, test_name):
        if os.path.exists(leftover):
            shutil.rmtree(leftover)
    if not dataset.clone_sample(c_name, training_name, test_name, sample_size):
        err = dataset.error_message()
        t.error(
            f"can't clone sample {c_name} size {sample_size} into {training_name}, {test_name} error {err}"
        )
def build_aggregate(collection):
    """Build a collection for usage by month. Always creates collection from scratch"""
    # Drop any previous copy so we always start clean.
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()
    # One record per month-start from January 2017 through today.
    first_month = datetime.fromisoformat("2017-01-01")
    today = datetime.today().date().isoformat()
    months = pd.date_range(first_month, today, freq="MS").strftime("%Y-%m")
    for month in months.to_list():
        if not dataset.create(collection, month, {"report-datasets": []}):
            print(dataset.error_message())
def match_codemeta():
    """Walk github_records.ds, and for each record not yet marked completed,
    detach its codemeta.json, push standardized metadata to CaltechDATA,
    then mark the record completed."""
    collection = "github_records.ds"
    keys = dataset.keys(collection)
    for k in keys:
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        if "completed" not in existing:
            print("Processing new record ", k)
            if dataset.attachments(collection, k) != "":
                dataset.detach(collection, k)
                # Update CaltechDATA
                token = os.environ["TINDTOK"]
                # BUG FIX: file handle was never closed; use a context manager.
                with open("codemeta.json", "r") as infile:
                    try:
                        # BUG FIX: bare "except:" swallowed every exception
                        # (including KeyboardInterrupt); json.load raises
                        # ValueError/JSONDecodeError on bad input.
                        meta = json.load(infile)
                    except ValueError:
                        print("Invalid json file - Skipping forever ", k)
                    else:
                        standardized = codemeta_to_datacite(meta)
                        # Check that all records have a GitHub subject tag
                        add = True
                        for s in standardized["subjects"]:
                            if s["subject"] == "Github":
                                add = False
                            if s["subject"] == "GitHub":
                                add = False
                        if add == True:
                            standardized["subjects"].append({"subject": "GitHub"})
                        response = caltechdata_edit(token, k, standardized, {}, {}, True)
                        print(response)
                os.system("rm codemeta.json")
            existing["completed"] = "True"
            if not dataset.update(collection, k, existing):
                err = dataset.error_message()
                # BUG FIX: message said "on read" for an update failure.
                print(f"Unexpected error on update: {err}")
def get_crossref_refs(prefix, done=False, new=True):
    """Harvest Crossref event-data citation events for a DOI prefix into
    crossref_refs.ds, following pagination cursors, then sync deletions
    and edits since the last capture date."""
    # New=True will download everything from scratch and delete any existing records
    collection = "crossref_refs.ds"
    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)
    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()
    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix)
    # "captured" records the date of the last completed harvest; if present,
    # only events collected since then are requested.
    collected = dataset.has_key(collection, "captured")
    cursor = ""
    count = 0
    # Paginate until the API stops returning a next-cursor.
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettyness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)
        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same curser back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None
    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]
        # Check Deleted
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on read: {err}")
            # NOTE(review): unlike the main loop, this has no same-cursor
            # guard -- assumes the API eventually returns next-cursor=None.
            cursor = records["message"]["next-cursor"]
        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]
    if done:
        # Record today's date as the new capture point for the next run.
        # NOTE(review): datetime.date.today() implies "datetime" is the module
        # here, while other functions call datetime.today() directly -- the
        # import style differs between functions; confirm against file imports.
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
def match_cd_refs():
    """Interactively match Crossref event-data citations against CaltechDATA
    records, and add approved links as IsCitedBy relatedIdentifiers.
    Returns the list of [key, matches] pairs that were applied."""
    token = os.environ["TINDTOK"]
    matches = []
    collection = "caltechdata.ds"
    keys = dataset.keys(collection)
    if "mediaupdate" in keys:
        keys.remove("mediaupdate")
    # Get event data results
    event_data = "crossref_refs.ds"
    event_keys = dataset.keys(event_data)
    event_keys.remove("captured")
    f_name = "match_cd_refs"
    dot_paths = [".obj_id", ".id", ".subj_id"]
    labels = ["obj_id", "id", "subj_id"]
    print("Getting Event Data Records")
    # Reuse the frame if it exists, otherwise create it.
    if dataset.has_frame(event_data, f_name):
        if not dataset.frame_reframe(event_data, f_name, event_keys):
            err = dataset.error_message()
            print(f"Failed to reframe {f_name} in {event_data}, {err}")
            exit()
    elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths,
                                  labels):
        err = dataset.error_message()
        print(f"Failed to create frame {f_name} in {event_data}, {err}")
        exit()
    # Group all events by the DOI they point at (obj_id).
    grid = dataset.frame_grid(event_data, f_name)
    df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"])
    grouped = df.groupby(["obj_id"])
    groups = grouped.groups
    # Look at all CaltechDATA records
    for k in keys:
        # Collect matched new links for the record
        record_matches = []
        print(k)
        metadata, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        doi = "https://doi.org/" + metadata["identifier"]["identifier"]
        if doi in groups:
            hits = grouped.get_group(doi)
            for index, h in hits.iterrows():
                # Trigger for whether we already have this link
                new = True
                if "relatedIdentifiers" in metadata:
                    for m in metadata["relatedIdentifiers"]:
                        if m["relatedIdentifier"] in h["subj_id"]:
                            new = False
                if new == True:
                    match = h["subj_id"]
                    print(match)
                    print(h["obj_id"])
                    # Human-in-the-loop approval for each candidate link
                    inputv = input("Do you approve this link? Type Y or N: ")
                    if inputv == "Y":
                        record_matches.append(match)
        # If we have to update record
        if len(record_matches) > 0:
            ids = []
            if "relatedIdentifiers" in metadata:
                for m in metadata["relatedIdentifiers"]:
                    ids.append(m)
            matches.append([k, record_matches])
            # Now collect identifiers for record
            for match in record_matches:
                split = match.split("doi.org/")
                new_id = {
                    "relatedIdentifier": split[1],
                    "relatedIdentifierType": "DOI",
                    "relationType": "IsCitedBy",
                }
                ids.append(new_id)
            newmetadata = {"relatedIdentifiers": ids}
            response = caltechdata_edit(token, k, newmetadata, {}, {}, True)
            print(response)
    return matches
def test_basic(t, collection_name):
    '''test_basic(collection_name) runs tests on basic CRUD ops'''
    # Setup a test record
    key = "2488"
    value = {
        "title":
        "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
        "formats": ["epub", "kindle", "plain text"],
        "authors": [{
            "given": "Jules",
            "family": "Verne"
        }],
        "url": "https://www.gutenberg.org/ebooks/2488"
    }
    # We should have an empty collection, we will create our test record.
    if dataset.create(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f'create({collection_name}, {key}, {value}) failed, {err}')
        return
    # Check to see that we have only one record
    key_count = dataset.count(collection_name)
    if key_count != 1:
        t.error(f"Failed, expected count to be 1, got {key_count}")
    # Do a minimal test to see if the record looks like it has content
    keyList = dataset.keys(collection_name)
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                # BUG FIX: message said "epxected".
                t.error(f"expected {rec[k]} got {v}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with list v, got {v}")
    # Test updating record
    value["verified"] = True
    if dataset.update(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f"update({collection_name}, {key}, {value}) failed, {err}")
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                # BUG FIX: the f prefixes were missing on these two messages,
                # so the placeholders printed verbatim.
                t.error(f"expected {rec[k]} got {v} for key {k}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with a list for v, got {v}")
    # Test path to record
    expected_s = "/".join(
        [collection_name, "pairtree", "24", "88", (key + ".json")])
    expected_l = len(expected_s)
    p = dataset.path(collection_name, key)
    if len(p) != expected_l:
        t.error("Failed, expected length", expected_l, "got", len(p))
    if p != expected_s:
        t.error("Failed, expected", expected_s, "got", p)
    # Test listing records
    l = dataset.list(collection_name, [key])
    if len(l) != 1:
        t.error(
            f"list({collection_name}, [{key}]) failed, list should return an array of one record, got",
            l)
        return
    # test deleting a record
    if dataset.delete(collection_name, key) == False:
        err = dataset.error_message()
        t.error("Failed, could not delete record", key, ", ", err)
def aggregate_usage(usage_collection, month_collection):
    """Fold per-record COUNTER usage from usage_collection into the
    per-month records of month_collection.

    For each record, monthly unique-investigation (view) and
    unique-request (use) counts are extracted, converted to COUNTER
    performance entries, and appended to the matching month record.
    """
    keys = dataset.keys(usage_collection)
    keys.remove("end-date")
    for k in progressbar(keys):
        record, err = dataset.read(usage_collection, k)
        if err != "":
            print(err)
        # Tally counts per "YYYY-MM" month.
        use = {}
        views = {}
        for usage in record["performance"]:
            split = usage["period"].split("-")
            month = split[0] + "-" + split[1]
            for u in usage["instance"]:
                metric = u["metric-type"]
                if metric == "unique-dataset-requests":
                    if month in use:
                        use[month] += u["count"]
                    else:
                        use[month] = u["count"]
                if metric == "unique-dataset-investigations":
                    if month in views:
                        views[month] += u["count"]
                    else:
                        views[month] = u["count"]
        # Strip non-counter stuff
        record.pop("_Key")
        record.pop("grand-total-unique-requests")
        record.pop("grand-total-unique-investigations")
        # go across months
        for view in views:
            split = view.split("-")
            date_obj = datetime(int(split[0]), int(split[1]), 1)
            d_range = get_month_day_range(date_obj)
            performance = [
                {
                    "period": {
                        "begin-date": d_range[0].date().isoformat(),
                        "end-date": d_range[1].date().isoformat(),
                    },
                    "instance": [],
                }
            ]
            v = views[view]
            performance[0]["instance"].append(
                {
                    "count": v,
                    "metric-type": "unique-dataset-investigations",
                    "access-method": "regular",
                }
            )
            # Handle when we have both views and uses in a given month
            if view in use:
                u = use[view]
                performance[0]["instance"].append(
                    {
                        "count": u,
                        "metric-type": "unique-dataset-requests",
                        "access-method": "regular",
                    }
                )
            existing, err = dataset.read(month_collection, view)
            if err != "":
                print(err)
            record["performance"] = performance
            existing["report-datasets"].append(record)
            if not dataset.update(month_collection, view, existing):
                err = dataset.error_message()
                print(err)
        for use_date in use:
            # We only have use-only records left to handle
            if use_date not in views:
                u = use[use_date]
                split = use_date.split("-")
                date_obj = datetime(int(split[0]), int(split[1]), 1)
                d_range = get_month_day_range(date_obj)
                performance = [
                    {
                        "period": {
                            "begin-date": d_range[0].date().isoformat(),
                            "end-date": d_range[1].date().isoformat(),
                        },
                        "instance": [
                            {
                                "count": u,
                                "metric-type": "unique-dataset-requests",
                                "access-method": "regular",
                            }
                        ],
                    }
                ]
                # BUG FIX: this loop read and updated month_collection with
                # "view" (the stale variable from the previous loop), so
                # use-only months were written to the wrong month record.
                existing, err = dataset.read(month_collection, use_date)
                if err != "":
                    print(err)
                record["performance"] = performance
                existing["report-datasets"].append(record)
                if not dataset.update(month_collection, use_date, existing):
                    err = dataset.error_message()
                    print(err)
def test_sync_csv(t, c_name):
    """Exercise import_csv, sync_send_csv and sync_recieve_csv round trips."""
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return
    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    # BUG FIX: str.strip(".ds") strips any of the characters '.', 'd', 's'
    # from both ends (e.g. "data.ds" -> "ata"); remove the suffix instead.
    csv_name = (c_name[:-len(".ds")] if c_name.endswith(".ds") else c_name) + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)
    # Import CSV into collection
    if dataset.import_csv(c_name, csv_name, True) == False:
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if dataset.has_key(c_name, key) == False:
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key('five') == False, got True")
    if dataset.create(c_name, "five", {"value": 5}) == False:
        err = dataset.error_message()
        # BUG FIX: braces doubled so the literal dict appears in the message
        # (single braces were parsed as a format expression).
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return
    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if dataset.frame_create(c_name, frame_name, keys, ["._Key", ".value"],
                            ["_Key", "value"]) == False:
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return
    #NOTE: Tests for sync_send_csv and sync_receive_csv
    if dataset.sync_send_csv(c_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
    if 'five' not in src:
        t.error(f"expected 'five' in src, got {src}")
    # Now remove "five" from collection
    if dataset.delete(c_name, "five") == False:
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key(five) == False, got True")
        return
    # NOTE: "recieve" spelling matches the dataset library's API name.
    if dataset.sync_recieve_csv(c_name, frame_name, csv_name, False) == False:
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if dataset.has_key(c_name, "five") == False:
        t.error(f"expected has_key(five) == True, got False")
        return
def test_frame_objects(t, c_name):
    """Exercise frames over objects with nested/complex dot paths."""
    if dataset.status(c_name) == True:
        dataset.close(c_name)
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}), {err}')
        return
    data = [{
        "id": "A",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }, {
            "nameIdentifier": "H-XXXX-XXXX",
            "nameIdentifierScheme": "ResearcherID",
            "schemeURI": "http://www.researcherid.com/rid/"
        }],
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }],
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = [
        "._Key", ".nameIdentifiers", ".nameIdentifiers[:].nameIdentifier",
        ".two", ".three", ".four"
    ]
    labels = [
        "id", "nameIdentifiers", "nameIdentifier", "two", "three", "four"
    ]
    for row in data:
        key = row['id']
        keys.append(key)
        # BUG FIX: create() returns a bool (not an error string); the result
        # was assigned to "err" and never checked.
        if dataset.create(c_name, key, row) == False:
            err = dataset.error_message()
            t.error(f'create({c_name}, {key}, ...), {err}')
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(
            f'frame_create({c_name}, {f_name}, {keys}, {dot_paths}, {labels}), {err}'
        )
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) != len(keys):
        t.error(f'expected {len(keys)}, got {len(f_keys)}')
    if dataset.frame_refresh(c_name, f_name) == False:
        err = dataset.error_message()
        # BUG FIX: message named frame_reframe but the call is frame_refresh.
        t.error(f'frame_refresh({c_name}, {f_name}), {err}')
    l = dataset.frames(c_name)
    if len(l) != 1 or l[0] != 'f1':
        t.error(f"expected one frame name, f1, got {l}")
    object_result = dataset.frame_objects(c_name, f_name)
    if len(object_result) != 4:
        t.error(
            f'Did not get correct number of objects back, expected 4 got {len(object_result)}, {object_result}'
        )
    count_nameId = 0
    count_nameIdObj = 0
    for obj in object_result:
        if 'id' not in obj:
            t.error('Did not get id in object')
        if 'nameIdentifiers' in obj:
            count_nameId += 1
            for idv in obj['nameIdentifiers']:
                if 'nameIdentifier' not in idv:
                    t.error('Missing part of object')
        if 'nameIdentifier' in obj:
            count_nameIdObj += 1
            if "0000-000X-XXXX-XXXX" not in obj['nameIdentifier']:
                t.error('Missing object in complex dot path')
    if count_nameId != 2:
        t.error(
            f"Incorrect number of nameIdentifiers elements, expected 2, got {count_nameId}"
        )
    if count_nameIdObj != 2:
        t.error(
            f"Incorrect number of nameIdentifier elements, expected 2, got {count_nameIdObj}"
        )
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
def test_keys(t, collection_name):
    '''test_keys(collection_name) test getting, filter and sorting keys'''
    # Test count after delete
    key_list = dataset.keys(collection_name)
    cnt = dataset.count(collection_name)
    if cnt != 0:
        t.error("Failed, expected zero records, got", cnt, key_list)
    #
    # Generate multiple records for collection for testing keys
    #
    test_records = {
        "gutenberg:21489": {
            "title": "The Secret of the Island",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "http://www.gutenberg.org/ebooks/21489",
            "categories": "fiction, novel"
        },
        "gutenberg:2488": {
            "title":
            "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "https://www.gutenberg.org/ebooks/2488",
            "categories": "fiction, novel"
        },
        "gutenberg:21839": {
            "title": "Sense and Sensibility",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jane",
                "family": "Austin"
            }],
            "url": "http://www.gutenberg.org/ebooks/21839",
            "categories": "fiction, novel"
        },
        "gutenberg:3186": {
            "title": "The Mysterious Stranger, and Other Stories",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Mark",
                "family": "Twain"
            }],
            "url": "http://www.gutenberg.org/ebooks/3186",
            "categories": "fiction, short story"
        },
        "hathi:uc1321060001561131": {
            "title":
            "A year of American travel - Narrative of personal experience",
            "formats": ["pdf"],
            "authors": [{
                "given": "Jessie Benton",
                "family": "Fremont"
            }],
            "url":
            "https://babel.hathitrust.org/cgi/pt?id=uc1.32106000561131;view=1up;seq=9",
            "categories": "non-fiction, memoir"
        }
    }
    test_count = len(test_records)
    for k in test_records:
        v = test_records[k]
        if dataset.create(collection_name, k, v) == False:
            err = dataset.error_message()
            t.error("Failed, could not add", k, "to", collection_name, ', ',
                    err)
    # Test keys, filtering keys and sorting keys
    all_keys = dataset.keys(collection_name)
    if len(all_keys) != test_count:
        # BUG FIX: reported "keys", an undefined name; report all_keys.
        t.error("Expected", test_count, "all_keys back, got", all_keys)
    #dataset.verbose_on()
    filter_expr = '(eq .categories "non-fiction, memoir")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 1:
        # BUG FIX: "keys" and "filter_expre" were undefined names inside
        # the f-string and would raise NameError on failure.
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected one key for",
            filter_expr, "got", filtered_keys)
    filter_expr = '(contains .categories "novel")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 3:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected three keys for",
            filter_expr, "got", filtered_keys)
    sort_expr = '+.title'
    filter_expr = '(contains .categories "novel")'
    sorted_keys = dataset.key_sort(collection_name, filtered_keys, sort_expr)
    if len(sorted_keys) != 3:
        t.error(
            f"key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected three keys for",
            filter_expr, "got", sorted_keys)
    expected_keys = ["gutenberg:21839", "gutenberg:21489", "gutenberg:2488"]
    for i, k in enumerate(expected_keys):
        if i < len(sorted_keys) and sorted_keys[i] != k:
            obj1, _ = dataset.read(collection_name, k)
            obj2, _ = dataset.read(collection_name, sorted_keys[i])
            # BUG FIX: "{q}" and "{keys[i]}" were undefined names; report
            # the actual sorted key instead.
            t.error(
                f'key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected {k} (title "{obj1["title"]}) got {sorted_keys[i]} (title "{obj2["title"]}")'
            )
def test_issue43(t, collection_name, csv_name):
    '''Regression test for issue 43.

    Builds a small collection with rows that have missing cells, frames it
    over columns c1..c4, exports to CSV, and verifies every non-empty CSV
    row still has all five cells (missing values export as empty cells,
    with warnings rather than a hard error).
    '''
    # Start from a clean slate: drop any leftover collection and CSV file.
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if os.path.exists(csv_name):
        os.remove(csv_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'Failed, need a {collection_name} to run test, {err}')
        return
    # Rows r2, r3 and r4 each deliberately omit one column.
    table = {
        "r1": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        },
        "r2": {
            "c1": "one",
            "c3": "three",
            "c4": "four"
        },
        "r3": {
            "c1": "one",
            "c2": "two",
            "c4": "four"
        },
        "r4": {
            "c1": "one",
            "c2": "two",
            "c3": "three"
        },
        "r5": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        }
    }
    for row_key, row_value in table.items():
        if dataset.create(collection_name, row_key, row_value) == False:
            err = dataset.error_message()
            t.error(f"Can't add test row {row_key} to {collection_name}, {err}")
            return
    # Missing dotpaths should produce warnings, not hard failures.
    dataset.use_strict_dotpath(False)
    # Setup frame
    frame_name = 'f1'
    keys = dataset.keys(collection_name)
    dot_paths = ["._Key", ".c1", ".c2", ".c3", ".c4"]
    labels = ["_Key", "c1", "c2", "c3", "c4"]
    if dataset.frame_create(collection_name, frame_name, keys, dot_paths,
                            labels) == False:
        err = dataset.error_message()
        t.error(err)
        return
    if dataset.export_csv(collection_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'export_csv({collection_name}, {frame_name}, {csv_name} should have emitted warnings, not error, {err}'
        )
        return
    with open(csv_name, mode='r', encoding='utf-8') as f:
        rows = f.read()
    # Every non-empty exported row must carry all five cells.
    for row in rows.split('\n'):
        if not row:
            continue
        cells = row.split(',')
        if len(cells) < 5:
            t.error(f'row error {csv_name} for {cells}')
def test_attachments(t, collection_name):
    '''Test attach, attachments, detach and prune against an existing collection.

    Creates two small text files, attaches them to the first key of the
    collection, and verifies attachment listing survives an update, detach
    restores the files to disk, and prune removes attachments.
    '''
    t.print("Testing attach, attachments, detach and prune")
    # Generate two files to attach.
    with open('a1.txt', 'w') as text_file:
        text_file.write('This is file a1')
    with open('a2.txt', 'w') as text_file:
        text_file.write('This is file a2')
    filenames = ['a1.txt', 'a2.txt']
    if dataset.status(collection_name) == False:
        t.error("Failed,", collection_name, "missing")
        return
    keys = dataset.keys(collection_name)
    if len(keys) < 1:
        t.error("Failed,", collection_name, "should have keys")
        return
    key = keys[0]
    if dataset.attach(collection_name, key, filenames) == False:
        err = dataset.error_message()
        t.error("Failed, to attach files for", collection_name, key,
                filenames, ', ', err)
        return
    l = dataset.attachments(collection_name, key)
    if len(l) != 2:
        t.error("Failed, expected two attachments for", collection_name, key,
                "got", l)
        return
    #Check that attachments arn't impacted by update
    if dataset.update(collection_name, key, {"testing": "update"}) == False:
        err = dataset.error_message()
        t.error("Failed, to update record", collection_name, key, err)
        return
    l = dataset.attachments(collection_name, key)
    if len(l) != 2:
        t.error("Failed, expected two attachments after update for",
                collection_name, key, "got", l)
        return
    # Remove the local copies so detach can be verified by their reappearance.
    if os.path.exists(filenames[0]):
        os.remove(filenames[0])
    if os.path.exists(filenames[1]):
        os.remove(filenames[1])
    # First try detaching one file.
    if dataset.detach(collection_name, key, [filenames[1]]) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for", collection_name, key,
                filenames[1], ', ', err)
    if os.path.exists(filenames[1]):
        os.remove(filenames[1])
    else:
        # FIX: message said "detch"
        t.error("Failed to detach", filenames[1], "from", collection_name, key)
    # Test explicit filenames detach
    if dataset.detach(collection_name, key, filenames) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for", collection_name, key, filenames,
                ', ', err)
    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)
        else:
            t.error("Failed, expected", fname, "to be detached from",
                    collection_name, key)
    # Test detaching all files
    if dataset.detach(collection_name, key, []) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for (detaching all)", collection_name,
                key, ', ', err)
    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)
        else:
            t.error("Failed, expected", fname, "for detaching all from",
                    collection_name, key)
    if dataset.prune(collection_name, key, [filenames[0]]) == False:
        # FIX: was dataset.error_messag() — AttributeError on this path.
        err = dataset.error_message()
        t.error("Failed, expected True for prune", collection_name, key,
                [filenames[0]], ', ', err)
    l = dataset.attachments(collection_name, key)
    if len(l) != 1:
        t.error("Failed, expected one file after prune for", collection_name,
                key, [filenames[0]], "got", l)
    if dataset.prune(collection_name, key, []) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for prune (all)", collection_name, key,
                ', ', err)
    l = dataset.attachments(collection_name, key)
    if len(l) != 0:
        t.error("Failed, expected zero files after prune for",
                collection_name, key, "got", l)