Beispiel #1
0
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage
    information.

    Initializes ``usage_collection`` if missing (seeding an "end-date"
    marker record), then adds a stub COUNTER-style usage record for every
    key in ``caltechdata_collection`` not already present.
    """
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        if not dataset.has_key(usage_collection, k):
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                # Prefer Submitted, then Updated, then Issued
                if rdate is None:
                    if submitted is not None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            if rdate is None:
                # "dates" existed but held none of the recognized dateTypes;
                # fall back to the dummy date so rdate.split("-") below
                # cannot raise AttributeError on None
                rdate = "2020-04-01"
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage
            # All dates are the date added to CaltechDATA, which is
            # the appropriate 'publication' date even if content was available
            # earlier
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
Beispiel #2
0
def test_issue12(t, c_name):
    """Regression test for issue #12: frames can be deleted and recreated.

    Loads five small records, clears any leftover frames, then creates,
    inspects and deletes the 'issue12' frame.
    """
    src = '''[
{"id": "1", "c1": 1, "c2": 2, "c3": 3 },
{"id": "2", "c1": 2, "c2": 2, "c3": 3 },
{"id": "3", "c1": 3, "c2": 3, "c3": 3 },
{"id": "4", "c1": 1, "c2": 1, "c3": 1 },
{"id": "5", "c1": 6, "c2": 6, "c3": 6 }
]'''
    #dataset.verbose_on() # DEBUG
    #dataset.use_strict_dotpath(True) # DEBUG
    if not dataset.status(c_name):
        if not dataset.init(c_name):
            err = dataset.error_message()
            # Include the captured error (was fetched but dropped before)
            t.error(f'failed to create {c_name}, {err}')
            return
    # Upsert the fixture records
    objects = json.loads(src)
    for obj in objects:
        key = obj['id']
        if dataset.has_key(c_name, key):
            dataset.update(c_name, key, obj)
        else:
            dataset.create(c_name, key, obj)
    # Remove any frames left over from a previous run
    f_names = dataset.frames(c_name)
    for f_name in f_names:
        ok = dataset.delete_frame(c_name, f_name)
        if not ok:
            err = dataset.error_message()
            t.error(f'Failed to delete {f_name} from {c_name} -> "{err}"')
            return
        if dataset.has_frame(c_name, f_name):
            t.error(
                f'Failed to delete frame {c_name} from {c_name}, frame still exists'
            )
            return
    f_name = 'issue12'
    dot_paths = [".c1", "c3"]
    labels = [".col1", ".col3"]
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(f'failed to create {f_name} from {c_name}, {err}')
        # Bail out: without the frame the checks below would all fail too
        return
    if not dataset.has_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected frame {f_name} to exists, {err}')
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) == 0:
        err = dataset.error_message()
        t.error(f'expected keys in {f_name}, got zero, {err}')
        return
    f_objects = dataset.frame_objects(c_name, f_name)
    if len(f_objects) == 0:
        err = dataset.error_message()
        t.error(f'expected objects in {f_name}, got zero, {err}')
        return
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected to delete {f_name} in {c_name}, {err}')
Beispiel #3
0
def get_caltechdata(collection, production=True, datacite=False):
    """Harvest all records from CaltechDATA .
    Always creates collection from scratch.

    production selects the live API vs the TIND sandbox; datacite=True
    stores bare DataCite metadata instead of the enriched record.
    """
    # Delete existing collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()

    if production:
        url = "https://data.caltech.edu/api/records"
    else:
        url = "https://cd-sandbox.tind.io/api/records"

    # size=9000 asks for everything in one page
    response = requests.get(url + "/?size=9000")
    hits = response.json()

    for h in progressbar(hits["hits"]["hits"]):
        rid = str(h["id"])
        # Get enriched metadata records (including files)
        if not datacite:
            metadata = decustomize_schema(h["metadata"], True, True, True)
            metadata["updated"] = h["updated"]
        else:
            # Get just DataCite metadata
            metadata = decustomize_schema(h["metadata"])

        if not dataset.create(collection, rid, metadata):
            err = dataset.error_message()
            print(err)
Beispiel #4
0
def test_frame(t, c_name):
    """Test frame_create, frame_reframe, frames listing and delete_frame
    against a freshly-initialized collection of four small records."""
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if not dataset.init(c_name):
        err = dataset.error_message()
        t.error(err)
        return
    data = [{
        "id": "A",
        "one": "one",
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "one": "ONE",
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = ["._Key", ".one", ".two", ".three", ".four"]
    labels = ["_Key", "one", "two", "three", "four"]
    for row in data:
        key = row['id']
        keys.append(key)
        dataset.create(c_name, key, row)
    f_name = 'f1'
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(err)
        # Without the frame, reframe/delete below would only add noise
        return
    if not dataset.frame_reframe(c_name, f_name):
        err = dataset.error_message()
        t.error(err)
    frame_names = dataset.frames(c_name)
    if len(frame_names) != 1 or frame_names[0] != 'f1':
        t.error(f"expected one frame name, f1, got {frame_names}")
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
Beispiel #5
0
def write_records(records, collection):
    """Store each record in *collection*, keyed by its "UID" field.

    Records whose key already exists in the collection are skipped.
    """
    for record in records:
        uid = record["UID"]
        if dataset.has_key(collection, uid):
            continue
        print(uid)
        err = dataset.create(collection, uid, record)
        if err != "":
            print(f"Unexpected error on create: {err}")
Beispiel #6
0
def test_issue32(t, collection_name):
    """Check has_key() answers True for a stored key and False otherwise
    (regression test for issue #32)."""
    created = dataset.create(collection_name, "k1", {"one": 1})
    if not created:
        err = dataset.error_message()
        t.error("Failed to create k1 in", collection_name, ', ', err)
        return
    if not dataset.has_key(collection_name, "k1"):
        t.error("Failed, has_key k1 should return", True)
    if dataset.has_key(collection_name, "k2"):
        t.error("Failed, has_key k2 should return", False)
Beispiel #7
0
def test_check_repair(t, collection_name):
    """Test dataset.check() and dataset.repair(): create a collection,
    verify it checks clean, break it by removing collection.json, verify
    the check fails, then repair and confirm collection.json is back."""
    t.print("Testing status on", collection_name)
    # Make sure we have a left over collection to check and repair
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if dataset.status(collection_name):
        dataset.close(collection_name)
    if not dataset.init(collection_name):
        err = dataset.error_message()
        t.error(f'init({collection_name}) failed, {err}')
        return
    if not dataset.status(collection_name):
        t.error(
            f"Failed, expected dataset.status() == True, got False for {collection_name}"
        )
        return

    if not dataset.has_key(collection_name, 'one'):
        if not dataset.create(collection_name, 'one', {"one": 1}):
            err = dataset.error_message()
            # Braces doubled: a bare {"one": 1} inside an f-string is parsed
            # as a replacement field with a format spec and raises ValueError
            t.error(
                f'create({collection_name}, "one", {{"one": 1}}) failed, {err}')
    t.print(f"Testing check on {collection_name}")
    # Check our collection
    if not dataset.check(collection_name):
        err = dataset.error_message()
        # Added the missing f prefix so the placeholders interpolate
        t.error(
            f"Failed, (before break) expected check True, got False for {collection_name} (err: {err})"
        )
        return

    # Break and recheck our collection
    print(f"Removing {collection_name}/collection.json to cause a fail")
    if os.path.exists(collection_name + "/collection.json"):
        os.remove(collection_name + "/collection.json")
    print(f"Testing check on (broken) {collection_name}")
    if dataset.check(collection_name):
        err = dataset.error_message()
        t.error(
            f"Failed, (after break) expected check False got True for {collection_name} (err: {err})"
        )
    else:
        t.print(f"Should have see error output for broken {collection_name}")

    # Repair our collection
    t.print("Testing repair on", collection_name)
    if not dataset.repair(collection_name):
        err = dataset.error_message()
        t.error("Failed, expected repair to return True, got, ", err)
    if not os.path.exists(os.path.join(collection_name, "collection.json")):
        t.error(
            f"Failed, expected recreated {collection_name}/collection.json")
Beispiel #8
0
def make_link_history(collection, resolver, url, note):
    """Make an entry in our link history collection.

    Fetches both the target ``url`` and the library resolver redirect,
    prints warnings for non-200 statuses or URL mismatches, then stores
    (or updates) a record keyed by ``resolver`` whose "history" list
    accumulates prior states.
    """
    now = datetime.today().isoformat()
    # Run checks on both resolver and final URL
    try:
        target = requests.get(url)
    except requests.exceptions.ConnectionError:
        # Fabricate a 404 response so the checks below still have
        # status_code/url attributes to read
        target = requests.Response()
        target.status_code = 404
        target.url = ''
    if target.status_code != 200:
        print(
            f"Target URL {url} returns Error status code {target.status_code}")
    if links_differ(target.url, url):
        print(f"Target URL '{url}' redirects to '{target.url}'")
    try:
        get = requests.get(f"https://resolver.library.caltech.edu/{resolver}")
    except requests.exceptions.ConnectionError:
        # Same fabricated-404 fallback for the resolver request
        get = requests.Response()
        get.status_code = 404
        get.url = ''
    if links_differ(get.url, url):
        print(f"Mismatch between expected url '{url}' and actual '{get.url}'")
    if get.status_code != 200:
        print(
            f"Resolver URL ({resolver}) '{get.url}' returns Error status code {get.status_code}"
        )
    entry = {
        "expected-url": url,
        "url": get.url,
        "modified": now,
        "code": get.status_code,
        "note": note,
    }
    # If existing, push into history
    if dataset.has_key(collection, resolver):
        existing, err = dataset.read(collection, resolver)
        if err != "":
            print(err)
            exit()
        # NOTE(review): when save_history() returns falsy the stored record
        # is left untouched — presumably intentional (nothing changed worth
        # recording); confirm against save_history's contract.
        if save_history(existing, url, get):
            # Move the prior state into the history list before overwriting
            past_history = existing.pop("history")
            past_history.append(existing)
            entry["history"] = past_history
            if not dataset.update(collection, resolver, entry):
                print(dataset.error_message())
                exit()
    else:
        # First sighting of this resolver: start with an empty history
        entry["history"] = []
        if not dataset.create(collection, resolver, entry):
            print(dataset.error_message())
            exit()
Beispiel #9
0
def get_history(collection, caltechdata_collection, caltechdata_keys):
    """Harvest the history of records from CaltechDATA.

    Only records whose "updated" date is newer than the date stored in
    the local "historyupdate" file are fetched; each revision is stored
    under the key "<record>-<revision-number>". On completion today's
    date is written back to "historyupdate".
    """
    keys_to_update = []
    if os.path.exists("historyupdate"):
        with open("historyupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(caltechdata_keys, redirect_stdout=True):
        existing, err = dataset.read(caltechdata_collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        record_update = datetime.fromisoformat(existing["updated"]).date()
        if record_update > update:
            keys_to_update.append(k)

    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    base_url = "https://data.caltech.edu/records/"

    for k in progressbar(keys_to_update):
        url = base_url + str(k) + "/revisions"
        response = requests.get(url)
        revisions = response.json()
        for num, metadata in enumerate(revisions):
            key = f"{k}-{num}"
            if not dataset.has_key(collection, key):
                # Surface create failures (result was silently dropped before)
                if not dataset.create(collection, key, metadata):
                    err = dataset.error_message()
                    print(f"Unexpected error on create: {err}")

    # Save date in file
    today = date.today().isoformat()
    with open("historyupdate", "w") as outfile:
        outfile.write(today)
Beispiel #10
0
def test_join(t, collection_name):
    """Test dataset.join() with overwrite=False (append-only merge) and
    overwrite=True (values replaced)."""
    key = "test_join1"
    obj1 = {"one": 1}
    obj2 = {"two": 2}
    if not dataset.status(collection_name):
        t.error("Failed, collection status is False,", collection_name)
        return
    ok = dataset.has_key(collection_name, key)
    err = ''
    if ok:
        # Fixed typo: was `collection_nane`, a NameError whenever the
        # test key already existed from a previous run
        ok = dataset.update(collection_name, key, obj1)
    else:
        ok = dataset.create(collection_name, key, obj1)
    if not ok:
        err = dataset.error_message()
        t.error(
            f'Failed, could not add record for test ({collection_name}, {key}, {obj1}), {err}'
        )
        return
    if not dataset.join(collection_name, key, obj2, overwrite=False):
        err = dataset.error_message()
        t.error(
            f'Failed, join for {collection_name}, {key}, {obj2}, overwrite = False -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    if obj_result.get('one') != 1:
        t.error(f'Failed to join append key {key}, {obj_result}')
    if obj_result.get("two") != 2:
        t.error(f'Failed to join append key {key}, {obj_result}')
    obj2['one'] = 3
    obj2['two'] = 3
    obj2['three'] = 3
    if not dataset.join(collection_name, key, obj2, overwrite=True):
        err = dataset.error_message()
        t.error(
            f'Failed to join overwrite {collection_name}, {key}, {obj2}, overwrite = True -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    # Every field except the key itself should now be 3
    for k in obj_result:
        if k != '_Key' and obj_result[k] != 3:
            t.error('Failed to update value in join overwrite', k, obj_result)
Beispiel #11
0
def build_aggregate(collection):
    """Build a collection for usage by month.
    Always creates collection from scratch"""
    # Drop any previous copy so we start clean
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()

    # Enumerate every month ("YYYY-MM") from January 2017 through today
    first_month = datetime.fromisoformat("2017-01-01")
    last_day = datetime.today().date().isoformat()
    months = pd.date_range(first_month, last_day, freq="MS").strftime("%Y-%m").to_list()

    # One empty report record per month, keyed by the month string
    for month in months:
        if not dataset.create(collection, month, {"report-datasets": []}):
            print(dataset.error_message())
Beispiel #12
0
def get_wos_refs(new=True):
    """Harvest Caltech records from the Web of Science API into all_wos.ds.

    new=True deletes any existing collection and downloads everything from
    scratch; otherwise only records loaded since the stored "captured"
    date are requested (via the loadTimeSpan query parameter). Results
    are paged 100 at a time and saved with write_records().
    """
    collection = "all_wos.ds"

    if new:
        if os.path.exists(collection):
            shutil.rmtree(collection)

    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    # Get access token from WOS sed as environment variable with source token.bash
    token = os.environ["WOSTOK"]

    headers = {"X-ApiKey": token, "Content-type": "application/json"}

    # Run query to get scope of records

    base_url = "https://api.clarivate.com/api/wos/?databaseId=WOK"

    collected = dataset.has_key(collection, "captured")

    if collected:
        # Only ask WOS for records loaded since the last capture date
        date = dataset.read(collection, "captured")
        date = date[0]["captured"]
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = current - date
        base_url = base_url + "&loadTimeSpan=" + str(diff.days) + "D"

    # Record the new capture date up front
    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        err = dataset.update(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on create: {err}")

    query = "OG=(California Institute of Technology)"
    query = urllib.parse.quote_plus(query)
    url = base_url + "&usrQuery=" + query + "&count=100&firstRecord=1"

    response = requests.get(url, headers=headers)
    response = response.json()
    record_count = response["QueryResult"]["RecordsFound"]
    print(record_count, " Records from WOS")
    query_id = response["QueryResult"]["QueryID"]
    try:
        records = response["Data"]["Records"]["records"]["REC"]
    except KeyError:
        # Unexpected response shape (e.g. quota or API error). Previously a
        # bare except fell through to write_records() with `records`
        # undefined, raising NameError; stop cleanly instead.
        print(response)
        return
    write_records(records, collection)
    # We have saved the first 100 records
    record_start = 101
    record_count = record_count - 100

    query_url = "https://api.clarivate.com/api/wos/query/"

    while record_count > 0:
        print(record_count)
        print(len(records), "records")
        if record_count > 100:
            url = (
                query_url
                + str(query_id)
                + "?count=100&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            try:
                records = response["Records"]["records"]["REC"]
            except KeyError:
                # Previously fell through and silently re-saved the prior
                # batch; stop paginating on a malformed response instead.
                print(response)
                break
            write_records(records, collection)
            record_start = record_start + 100
            record_count = record_count - 100
        else:
            # Final partial page
            url = (
                query_url
                + str(query_id)
                + "?count="
                + str(record_count)
                + "&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            records = response["Records"]["records"]["REC"]
            write_records(records, collection)
            record_count = 0

    print("Downloaded all records ")
Beispiel #13
0
def test_sync_csv(t, c_name):
    """Test sync_send_csv / sync_recieve_csv round-tripping between a
    collection and a CSV file via a frame."""
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if not dataset.init(c_name):
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return

    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    # strip(".ds") removed a *character set* from both ends, which can eat
    # legitimate leading/trailing '.', 'd', 's' characters; remove the
    # literal suffix instead
    if c_name.endswith(".ds"):
        csv_name = c_name[:-len(".ds")] + ".csv"
    else:
        csv_name = c_name + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)

    # Import CSV into collection
    if not dataset.import_csv(c_name, csv_name, True):
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if not dataset.has_key(c_name, key):
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five"):
        t.error("expected has_key('five') == False, got True")
    if not dataset.create(c_name, "five", {"value": 5}):
        err = dataset.error_message()
        # Braces doubled: a bare {"value": 5} inside an f-string is parsed as
        # a replacement field with a format spec and raises ValueError
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return

    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, frame_name, keys, ["._Key", ".value"],
                                ["_Key", "value"]):
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return

    #NOTE: Tests for sync_send_csv and sync_receive_csv
    if not dataset.sync_send_csv(c_name, frame_name, csv_name):
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
        if 'five' not in src:
            t.error(f"expected 'five' in src, got {src}")

    # Now remove "five" from collection
    if not dataset.delete(c_name, "five"):
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five"):
        t.error("expected has_key(five) == False, got True")
        return
    # NOTE: "recieve" is the spelling the dataset API actually exposes
    if not dataset.sync_recieve_csv(c_name, frame_name, csv_name, False):
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if not dataset.has_key(c_name, "five"):
        t.error("expected has_key(five) == True, got False")
        return
Beispiel #14
0
def get_crossref_refs(prefix, done=False, new=True):
    """Harvest Crossref Event Data events for a DOI *prefix* into
    crossref_refs.ds, incrementally from the stored "captured" date.

    When the collection already has a "captured" record, deleted and
    edited events since that date are also replayed. done=True writes
    today's date as the new "captured" checkpoint when finished.
    """
    # New=True will download everything from scratch and delete any existing records

    collection = "crossref_refs.ds"

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix)

    # True when a previous harvest left a "captured" checkpoint record
    collected = dataset.has_key(collection, "captured")

    # Page through events using the API's cursor; None terminates the loop
    cursor = ""
    count = 0
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettyness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)

        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same cursor back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None

    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]

        # Check Deleted
        # NOTE(review): this loop assumes next-cursor eventually becomes
        # None/empty-falsy; if the API keeps returning a cursor string it
        # will not terminate — confirm against the Event Data API contract.
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on read: {err}")
            cursor = records["message"]["next-cursor"]

        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]

    if done:
        # NOTE(review): datetime.date.today() implies `import datetime`
        # (module style); other functions in this repo call
        # datetime.today(), which implies `from datetime import datetime`.
        # Confirm this file's actual import.
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
Beispiel #15
0
            # Ingest one source file: parse its front matter, read the raw
            # text, and store/update it in the collection keyed by the
            # front-matter "id". (Enclosing loop begins outside this view.)
            log.print(f"Ingesting {f_name}")
            metadata = frontmatter(f_name)
            with open(f_name) as f:
                src = f.read()
            if "id" in metadata:
                key = str(metadata["id"])
                if dataset.has_key(c_name, key):
                    err = dataset.update(c_name, key, {
                        "metadata": metadata,
                        "content": f_name,
                        "src": src
                    })
                else:
                    err = dataset.create(c_name, key, {
                        "metadata": metadata,
                        "content": f_name,
                        "src": src
                    })
                if err != "":
                    log.fatal(err)
            else:
                # Files without an "id" in front matter are only warned about
                log.print(f"Warning, no front matter for {f_name}")

# for each dataset record render appropriate HTML pages
keys = dataset.keys(c_name)
for key in keys:
    page, err = dataset.read(c_name, key)
    if err != "":
        log.print(f"WARNING: could not read {key} from {c_name}, skipping")
    # NOTE(review): despite the "skipping" message above, execution falls
    # through to the lookup below even on a read error — confirm whether a
    # `continue` was intended.
    if 'output' in page['metadata']:
        p = page['metadata']['output']
Beispiel #16
0
def test_keys(t, collection_name):
    '''test_keys(collection_name) test getting, filter and sorting keys'''
    # Test count after delete
    key_list = dataset.keys(collection_name)
    cnt = dataset.count(collection_name)
    if cnt != 0:
        t.error("Failed, expected zero records, got", cnt, key_list)

    #
    # Generate multiple records for collection for testing keys
    #
    test_records = {
        "gutenberg:21489": {
            "title": "The Secret of the Island",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "http://www.gutenberg.org/ebooks/21489",
            "categories": "fiction, novel"
        },
        "gutenberg:2488": {
            "title":
            "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "https://www.gutenberg.org/ebooks/2488",
            "categories": "fiction, novel"
        },
        "gutenberg:21839": {
            "title": "Sense and Sensibility",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jane",
                "family": "Austin"
            }],
            "url": "http://www.gutenberg.org/ebooks/21839",
            "categories": "fiction, novel"
        },
        "gutenberg:3186": {
            "title": "The Mysterious Stranger, and Other Stories",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Mark",
                "family": "Twain"
            }],
            "url": "http://www.gutenberg.org/ebooks/3186",
            "categories": "fiction, short story"
        },
        "hathi:uc1321060001561131": {
            "title":
            "A year of American travel - Narrative of personal experience",
            "formats": ["pdf"],
            "authors": [{
                "given": "Jessie Benton",
                "family": "Fremont"
            }],
            "url":
            "https://babel.hathitrust.org/cgi/pt?id=uc1.32106000561131;view=1up;seq=9",
            "categories": "non-fiction, memoir"
        }
    }
    test_count = len(test_records)

    for k in test_records:
        v = test_records[k]
        if not dataset.create(collection_name, k, v):
            err = dataset.error_message()
            t.error("Failed, could not add", k, "to", collection_name, ', ',
                    err)

    # Test keys, filtering keys and sorting keys
    all_keys = dataset.keys(collection_name)
    if len(all_keys) != test_count:
        # Fixed: reported `keys`, an undefined name, on failure
        t.error("Expected", test_count, "all_keys back, got", all_keys)

    #dataset.verbose_on()
    filter_expr = '(eq .categories "non-fiction, memoir")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 1:
        # Fixed: `{keys}` and `{filter_expre}` were undefined names that
        # raised NameError when this error path triggered
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected one key for",
            filter_expr, "got", filtered_keys)

    filter_expr = '(contains .categories "novel")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 3:
        t.error(
            f"key_filter({collection_name}, {all_keys}, {filter_expr}), Expected three keys for",
            filter_expr, "got", filtered_keys)

    sort_expr = '+.title'
    filter_expr = '(contains .categories "novel")'
    sorted_keys = dataset.key_sort(collection_name, filtered_keys, sort_expr)
    if len(sorted_keys) != 3:
        t.error(
            f"key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected three keys for",
            filter_expr, "got", sorted_keys)
    expected_keys = ["gutenberg:21839", "gutenberg:21489", "gutenberg:2488"]
    for i, k in enumerate(expected_keys):
        if i < len(sorted_keys) and sorted_keys[i] != k:
            obj1, _ = dataset.read(collection_name, k)
            obj2, _ = dataset.read(collection_name, sorted_keys[i])
            # Fixed: `{q}` and `{keys[i]}` were undefined; also balanced the
            # quotes around the first title
            t.error(
                f'key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected {k} (title "{obj1["title"]}") got {sorted_keys[i]} (title "{obj2["title"]}")'
            )
Beispiel #17
0
def test_basic(t, collection_name):
    '''test_basic(t, collection_name) runs tests on basic CRUD ops:
    create, count, read, update, path, list and delete against a fresh
    dataset collection, reporting failures through the `t` test helper.'''
    # Setup a test record
    key = "2488"
    value = {
        "title":
        "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
        "formats": ["epub", "kindle", "plain text"],
        "authors": [{
            "given": "Jules",
            "family": "Verne"
        }],
        "url": "https://www.gutenberg.org/ebooks/2488"
    }

    # We should have an empty collection, we will create our test record.
    if dataset.create(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f'create({collection_name}, {key}, {value}) failed, {err}')
        return

    # Check to see that we have only one record
    key_count = dataset.count(collection_name)
    if key_count != 1:
        t.error(f"Failed, expected count to be 1, got {key_count}")

    # Do a minimal test to see if the record looks like it has content
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                # FIX: original message had the "epxected" typo and swapped
                # expected/got; rec.get avoids a KeyError when k is missing
                # from the stored record.
                t.error(f"expected {v} got {rec.get(k)} for key {k}")
        else:
            # Only the two known list-valued fields are acceptable here.
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with list v, got {v}")

    # Test updating record
    value["verified"] = True
    if dataset.update(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f"update({collection_name}, {key}, {value}) failed, {err}")
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                # FIX: original string lacked the f prefix, so the
                # placeholders were printed literally.
                t.error(f"expected {v} got {rec.get(k)} for key {k}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                # FIX: original string lacked the f prefix here as well.
                t.error(f"Failed, expected {k} with a list for v, got {v}")

    # Test path to record; pairtree storage shards key "2488" into 24/88.
    expected_s = "/".join(
        [collection_name, "pairtree", "24", "88", (key + ".json")])
    expected_l = len(expected_s)
    p = dataset.path(collection_name, key)
    if len(p) != expected_l:
        t.error("Failed, expected length", expected_l, "got", len(p))
    if p != expected_s:
        t.error("Failed, expected", expected_s, "got", p)

    # Test listing records (renamed from `l`, which shadowed a builtin-like
    # one-letter name; also dropped the unused keyList local).
    records = dataset.list(collection_name, [key])
    if len(records) != 1:
        t.error(
            f"list({collection_name}, [{key}]) failed, list should return an array of one record, got",
            records)
        return

    # test deleting a record
    if dataset.delete(collection_name, key) == False:
        err = dataset.error_message()
        t.error("Failed, could not delete record", key, ", ", err)
Beispiel #18
0
                                                 dupe.affiliations)
        if subject.years not in dupe.years:
            dupe.years += subject.years
        if subject.names not in dupe.names:
            dupe.names += subject.names
        if subject.links not in dupe.links:
            dupe.links += subject.links

print("Total collaborators: ", len(deduped))

# Dataset collection that will hold the de-duplicated collaborator records.
collab = 'collaborators.ds'

# Start from a clean collection each run.
# NOTE(review): shells out to `rm -rf`; shutil.rmtree would avoid the
# subprocess, but behavior is kept as-is here.
subprocess.run(['rm', '-rf', collab])
dataset.init(collab)
for d in deduped:
    # d.write() presumably serializes the collaborator to a JSON-able dict
    # keyed by its ca_id -- TODO confirm against the class definition.
    dataset.create(collab, d.ca_id, d.write())
#Export to Google Sheet
os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"

#Google sheet ID for output
f_name = 'frm'
sheet_name = "Sheet1"
sheet_range = "A1:CZ"
# Dot paths extracted from each record and the column titles paired with
# them in the data frame.
export_list = [".names", ".years", ".affiliations", ".links"]
title_list = ["name", "years", "affiliations", "links"]
keys = dataset.keys(collab)
# Rebuild the frame from scratch so a stale definition doesn't linger.
if dataset.has_frame(collab, f_name):
    dataset.delete_frame(collab, f_name)
frame, err = dataset.frame(collab, f_name, keys, export_list, title_list)
if err != '':
    print(err)
Beispiel #19
0
def test_issue43(t, collection_name, csv_name):
    '''test_issue43(t, collection_name, csv_name) exercises CSV export of
    a frame over records with missing columns (project issue 43): export
    should emit warnings rather than fail, and every exported row should
    still carry all five columns.'''
    # Start from scratch: remove any leftover collection and CSV file.
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if os.path.exists(csv_name):
        os.remove(csv_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'Failed, need a {collection_name} to run test, {err}')
        return
    # Five rows; r2, r3 and r4 each omit one of the c1..c4 columns.
    table = {
        "r1": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        },
        "r2": {
            "c1": "one",
            "c3": "three",
            "c4": "four"
        },
        "r3": {
            "c1": "one",
            "c2": "two",
            "c4": "four"
        },
        "r4": {
            "c1": "one",
            "c2": "two",
            "c3": "three"
        },
        "r5": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        }
    }
    for key in table:
        row = table[key]
        if dataset.create(collection_name, key, row) == False:
            err = dataset.error_message()
            t.error(f"Can't add test row {key} to {collection_name}, {err}")
            return

    # Presumably relaxes dot-path resolution so missing keys export as
    # blank cells instead of hard errors -- the behavior under test.
    dataset.use_strict_dotpath(False)
    # Setup frame
    frame_name = 'f1'
    keys = dataset.keys(collection_name)
    if dataset.frame_create(collection_name, frame_name, keys,
                            ["._Key", ".c1", ".c2", ".c3", ".c4"],
                            ["_Key", "c1", "c2", "c3", "c4"]) == False:
        err = dataset.error_message()
        t.error(err)
        return
    if dataset.export_csv(collection_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        # NOTE(review): the message below is missing the closing ")" after
        # {csv_name}; left unchanged here because it is runtime text.
        t.error(
            f'export_csv({collection_name}, {frame_name}, {csv_name} should have emitted warnings, not error, {err}'
        )
        return
    with open(csv_name, mode='r', encoding='utf-8') as f:
        rows = f.read()

    # Every non-empty CSV row must contain at least the five expected cells.
    for row in rows.split('\n'):
        if len(row) > 0:
            cells = row.split(',')
            if len(cells) < 5:
                t.error(f'row error {csv_name} for {cells}')
                    #We're just going to use the the first other identifier as filler
                    if link == '':
                        link = idv['value']
        else:
            #Just one identifier
            if identifiers['type'] == 'xref_doi':
                link = 'https://doi.org/' + idv['value']
            elif identifiers['type'] == 'doi':
                link = 'https://doi.org/' + idv['value']
            else:
                link = idv['value']

        record =\
                {'id':uid,'title':title,'journal':journal,'authors':author_list,'identifiers':identifier_list,'affiliations':affiliation_list,'link':link,'year':publication_date.year}

        dataset.create(collection, link, record)

#Export to Google Sheet
os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"

#Google sheet ID for output
sheet_name = "Sheet1"
sheet_range = "A1:CZ"
f_name = 'f_name'
# Dot paths extracted from each record and the column titles paired with
# them in the data frame.
export_list = [".link", ".title", ".journal", ".year"]
title_list = ["link", "title", "journal", "year"]
keys = dataset.keys(collection)
# Rebuild the frame from scratch so a stale definition doesn't linger.
# NOTE(review): the `if err != '':` handler following this block is cut
# off by the excerpt boundary.
if dataset.has_frame(collection, f_name):
    dataset.delete_frame(collection, f_name)
frame, err = dataset.frame(collection, f_name, keys, export_list, title_list)
if err != '':
Beispiel #21
0
def test_frame_objects(t, c_name):
    '''test_frame_objects(t, c_name) exercises data-frame operations:
    frame_create, frame_keys, frame_refresh, frames listing,
    frame_objects extraction (including a nested dot path into
    nameIdentifiers) and delete_frame, reporting failures via `t`.'''
    # Start from a fresh collection.
    if dataset.status(c_name) == True:
        dataset.close(c_name)
        if os.path.exists(c_name):
            shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}), {err}')
        return
    # Four records: A and D carry nameIdentifiers, B has only scalars,
    # C has only an id -- chosen to exercise sparse frame extraction.
    data = [{
        "id":
        "A",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }, {
            "nameIdentifier": "H-XXXX-XXXX",
            "nameIdentifierScheme": "ResearcherID",
            "schemeURI": "http://www.researcherid.com/rid/"
        }],
        "two":
        22,
        "three":
        3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id":
        "D",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }],
        "two":
        20,
        "three":
        334.1,
        "four": []
    }]
    keys = []
    dot_paths = [
        "._Key", ".nameIdentifiers", ".nameIdentifiers[:].nameIdentifier",
        ".two", ".three", ".four"
    ]
    labels = [
        "id", "nameIdentifiers", "nameIdentifier", "two", "three", "four"
    ]
    for row in data:
        key = row['id']
        keys.append(key)
        # FIX: dataset.create returns a boolean; the original assigned it
        # to `err` and never checked it.
        if dataset.create(c_name, key, row) == False:
            err = dataset.error_message()
            t.error(f'create({c_name}, {key}), {err}')
            return
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(
            f'frame_create({c_name}, {f_name}, {keys}, {dot_paths}, {labels}), {err}'
        )
        return
    # The frame should cover exactly the keys it was built from.
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) != len(keys):
        t.error(f'expected {len(keys)}, got {len(f_keys)}')
    if dataset.frame_refresh(c_name, f_name) == False:
        err = dataset.error_message()
        # FIX: message said "frame_reframe" but the call is frame_refresh.
        t.error(f'frame_refresh({c_name}, {f_name}), {err}')
    # Renamed from `l` (shadowed builtin-style one-letter name).
    frame_names = dataset.frames(c_name)
    if len(frame_names) != 1 or frame_names[0] != 'f1':
        t.error(f"expected one frame name, f1, got {frame_names}")
    object_result = dataset.frame_objects(c_name, f_name)
    if len(object_result) != 4:
        t.error(
            f'Did not get correct number of objects back, expected 4 got {len(object_result)}, {object_result}'
        )
    # Records A and D carry nameIdentifiers, so each of the two extracted
    # attributes should appear exactly twice across the four objects.
    count_nameId = 0
    count_nameIdObj = 0
    for obj in object_result:
        if 'id' not in obj:
            t.error('Did not get id in object')
        if 'nameIdentifiers' in obj:
            count_nameId += 1
            for idv in obj['nameIdentifiers']:
                if 'nameIdentifier' not in idv:
                    t.error('Missing part of object')
        if 'nameIdentifier' in obj:
            count_nameIdObj += 1
            # Both A and D include the ORCID value in their identifier lists.
            if "0000-000X-XXXX-XXXX" not in obj['nameIdentifier']:
                t.error('Missing object in complex dot path')
    if count_nameId != 2:
        t.error(
            f"Incorrect number of nameIdentifiers elements, expected 2, got {count_nameId}"
        )
    if count_nameIdObj != 2:
        t.error(
            f"Incorrect number of nameIdentifier elements, expected 2, got {count_nameIdObj}"
        )
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
Beispiel #22
0
def read_records(data, current, collection):
    """Store harvested records in a dataset collection, downloading and
    attaching their files when the already-attached files don't match.

    Args:
        data: iterable of record dicts, each with an "id" and a
            "metadata" dict.
        current: ids already present in `collection` (tested with `in`).
        collection: name/path of the dataset collection to write to.

    Side effects: creates/updates dataset records, downloads and attaches
    files, removes the downloaded temp files, and exits the process on
    dataset errors.
    """
    for record in data:
        rid = str(record["id"])
        metadata = record["metadata"]
        download = False  # Flag for downloading files
        # Do we need to download?
        if "electronic_location_and_access" in metadata:
            # Get information about already backed up files:
            existing_size = []
            existing_names = []
            if rid in current:
                # Get existing files; each attachment string is
                # "<name> <size>" where the name itself may contain spaces,
                # so the size is always the last space-separated token.
                attachments = dataset.attachments(collection, rid)
                for a in attachments:
                    size = a.split(" ")[-1]
                    name = a.replace(f' {size}', '')
                    existing_names.append(name)
                    existing_size.append(size)
            # Count how many of the record's files are already attached.
            # NOTE(review): `size` here comes from record metadata while
            # `existing_size` holds strings parsed from attachment labels;
            # assumes both are the same (string) type -- TODO confirm.
            count = len(metadata["electronic_location_and_access"])
            dl = 0
            for erecord in metadata["electronic_location_and_access"]:
                size = erecord["file_size"]
                name = erecord["electronic_name"][0]
                if size in existing_size and name in existing_names:
                    dl += 1
            if dl == count:
                print(
                    "files already downloaded ",
                    existing_size,
                    existing_names,
                )
                download = False
            else:
                print("file mismatch ", existing_size, existing_names, dl,
                      count)
                download = True

        # Save results in dataset
        print("Saving record " + rid)

        if rid in current:
            print('Update')
            if not dataset.update(collection, rid, record):
                # FIX: original message said "create" in the update path.
                print(
                    f"Failed, could not update record: {dataset.error_message()}"
                )
                exit()
        else:
            create = dataset.create(collection, rid, record)
            print('CREATED', create, rid)
            if not create:
                print(
                    f"Failed, could not create record: {dataset.error_message()}"
                )
                exit()

        if download:
            files = []

            print("Downloading files for ", rid)

            for erecord in metadata["electronic_location_and_access"]:
                f = download_file(erecord, rid)
                if f is not None:
                    files.append(f)

            print(files)
            print("Attaching files")

            if len(files) != 0:
                if dataset.attach(collection, rid, files) == False:
                    print(f"Failed on attach {dataset.error_message()}")
                    exit()

            # `files` only ever holds non-None paths (filtered above), so
            # the original per-item None re-check was redundant.
            for f in files:
                os.remove(f)
Beispiel #23
0
def get_cd_github(new=True):
    """Harvest CaltechDATA records tagged with the GitHub subject into the
    github_records.ds dataset collection; for records not yet stored,
    download their files and attach any root-level codemeta.json found
    inside the zip archive.

    Args:
        new: when True, delete and rebuild the collection from scratch.

    NOTE(review): this excerpt ends at the codemeta attachment step; the
    function may continue beyond the visible lines.
    """

    collection = "github_records.ds"

    if new == True:
        os.system("rm -rf " + collection)

    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    url = "https://data.caltech.edu/api/records"

    # Fetch up to 1000 records tagged with the GitHub subject.
    response = requests.get(url + "/?size=1000&q=subjects:GitHub")
    hits = response.json()

    for h in hits["hits"]["hits"]:
        rid = str(h["id"])
        record = h["metadata"]

        # Only process records we have not already stored.
        result = dataset.has_key(collection, rid)

        if result == False:

            dataset.create(collection, rid, record)

            print("Downloading files for ", rid)

            codemeta = False

            for erecord in record["electronic_location_and_access"]:
                f = download_file(erecord, rid)

                # We're just looking for the zip file
                if f.split(".")[-1] == "zip":
                    # List archive contents; `unzip -l` output is assumed to
                    # have a 4-line header and a dashed footer line -- TODO
                    # confirm across unzip versions.
                    zip_files = subprocess.check_output(
                        ["unzip", "-l", f.rstrip()],
                        universal_newlines=True).splitlines()
                    i = 4  # Ignore header
                    line = zip_files[i]
                    while line[0] != "-":
                        split = line.split("/")
                        fname = split[1]
                        if fname == "codemeta.json":
                            # Extract just codemeta.json (junking its path)
                            # into the current directory.
                            sp = line.split("   ")[-1]
                            os.system("unzip -j " + f.rstrip() + " " + sp +
                                      " -d .")
                            codemeta = True
                        i = i + 1
                        line = zip_files[i]
                        # Will only identify codemeta files in root of repo

                # Trash downloaded files - extracted codemeta.json not impacted
                print("Trash " + f)
                os.system("rm " + f)

            if codemeta == True:
                print(collection, rid)
                # Attach the extracted codemeta.json to the dataset record.
                # (This rebinds `response`, shadowing the requests response.)
                response = dataset.attach(collection, rid, ["codemeta.json"])
                print("Attachment ", response)
                os.system("rm codemeta.json")
                print("Trash codemeta.json")