Code example #1
File: test_dataset.py  Project: iamgy/py_dataset
def test_check_repair(t, collection_name):
    t.print("Testing status on", collection_name)
    # Make sure we have a left over collection to check and repair
    if os.path.exists(collection_name) == True:
        shutil.rmtree(collection_name)
    if dataset.status(collection_name) == True:
        dataset.close(collection_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'init({collection_name}) failed, {err}')
        return
    if dataset.status(collection_name) == False:
        t.error(
            f"Failed, expected dataset.status() == True, got False for {collection_name}"
        )
        return

    if dataset.has_key(collection_name, 'one') == False:
        if dataset.create(collection_name, 'one', {"one": 1}) == False:
            err = dataset.error_message()
            t.error(
                f'create({collection_name}, "one", {{"one": 1}}) failed, {err}')
    t.print(f"Testing check on {collection_name}")
    # Check our collection
    if not (dataset.check(collection_name) == True):
        err = dataset.error_message()
        t.error(
            "Failed, (before break) expected check True, got False for {collection_name} (err: {err})"
        )
        return

    # Break and recheck our collection
    print(f"Removing {collection_name}/collection.json to cause a fail")
    if os.path.exists(collection_name + "/collection.json"):
        os.remove(collection_name + "/collection.json")
    print(f"Testing check on (broken) {collection_name}")
    if not (dataset.check(collection_name) == False):
        err = dataset.error_message()
        t.error(
            f"Failed, (after break) expected check False got True for {collection_name} (err: {err})"
        )
    else:
        t.print(f"Should have see error output for broken {collection_name}")

    # Repair our collection
    t.print("Testing repair on", collection_name)
    if dataset.repair(collection_name) == False:
        err = dataset.error_message()
        t.error("Failed, expected repair to return True, got, ", err)
    if os.path.exists(os.path.join(collection_name,
                                   "collection.json")) == False:
        t.error(
            f"Failed, expected recreated {collection_name}/collection.json")
Code example #2
def make_link_history(collection, resolver, url, note):
    """Make an entry in our link history collection"""
    now = datetime.today().isoformat()
    # Run checks on both resolver and final URL
    try:
        target = requests.get(url)
    except requests.exceptions.ConnectionError:
        target = requests.Response()
        target.status_code = 404
        target.url = ''
    if target.status_code != 200:
        print(
            f"Target URL {url} returns Error status code {target.status_code}")
    if links_differ(target.url, url):
        print(f"Target URL '{url}' redirects to '{target.url}'")
    try:
        get = requests.get(f"https://resolver.library.caltech.edu/{resolver}")
    except requests.exceptions.ConnectionError:
        get = requests.Response()
        get.status_code = 404
        get.url = ''
    if links_differ(get.url, url):
        print(f"Mismatch between expected url '{url}' and actual '{get.url}'")
    if get.status_code != 200:
        print(
            f"Resolver URL ({resolver}) '{get.url}' returns Error status code {get.status_code}"
        )
    entry = {
        "expected-url": url,
        "url": get.url,
        "modified": now,
        "code": get.status_code,
        "note": note,
    }
    # If existing, push into history
    if dataset.has_key(collection, resolver):
        existing, err = dataset.read(collection, resolver)
        if err != "":
            print(err)
            exit()
        if save_history(existing, url, get):
            past_history = existing.pop("history")
            past_history.append(existing)
            entry["history"] = past_history
            if not dataset.update(collection, resolver, entry):
                print(dataset.error_message())
                exit()
    else:
        entry["history"] = []
        if not dataset.create(collection, resolver, entry):
            print(dataset.error_message())
            exit()
Code example #3
File: test_dataset.py  Project: iamgy/py_dataset
def test_setup(t, collection_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error("init({collection_name}) failed, {err}")
        return
Code example #4
def get_caltechdata(collection, production=True, datacite=False):
    """Harvest all records from CaltechDATA .
    Always creates collection from scratch"""
    # Delete existing collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()

    if production == True:
        url = "https://data.caltech.edu/api/records"
    else:
        url = "https://cd-sandbox.tind.io/api/records"

    response = requests.get(url + "/?size=9000")
    hits = response.json()

    print(hits)
    for h in progressbar(hits["hits"]["hits"]):
        rid = str(h["id"])
        # Get enriched metadata records (including files)
        if datacite == False:
            metadata = decustomize_schema(h["metadata"], True, True, True)
            metadata["updated"] = h["updated"]
        else:
            # Get just DataCite metadata
            metadata = decustomize_schema(h["metadata"])

        if not dataset.create(collection, rid, metadata):
            err = dataset.error_message()
            print(err)
Code example #5
File: usage.py  Project: caltechlibrary/ames
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage
    information"""
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage
            # All dates are the date added to CaltechDATA, which is
            # the appropriate 'publication' date even if content was available
            # earlier
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
Code example #6
File: test_dataset.py  Project: iamgy/py_dataset
def test_frame(t, c_name):
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(err)
        return
    data = [{
        "id": "A",
        "one": "one",
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "one": "ONE",
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = ["._Key", ".one", ".two", ".three", ".four"]
    labels = ["_Key", "one", "two", "three", "four"]
    for row in data:
        key = row['id']
        keys.append(key)
        dataset.create(c_name, key, row)
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(err)
    if dataset.frame_reframe(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(err)
    l = dataset.frames(c_name)
    if len(l) != 1 or l[0] != 'f1':
        t.error(f"expected one frame name, f1, got {l}")
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
Code example #7
File: test_dataset.py  Project: iamgy/py_dataset
def test_join(t, collection_name):
    key = "test_join1"
    obj1 = {"one": 1}
    obj2 = {"two": 2}
    if dataset.status(collection_name) == False:
        t.error("Failed, collection status is False,", collection_name)
        return
    ok = dataset.has_key(collection_name, key)
    err = ''
    if ok == True:
        ok = dataset.update(collection_name, key, obj1)
    else:
        ok = dataset.create(collection_name, key, obj1)
    if ok == False:
        err = dataset.error_message()
        t.error(
            f'Failed, could not add record for test ({collection_name}, {key}, {obj1}), {err}'
        )
        return
    if dataset.join(collection_name, key, obj2, overwrite=False) == False:
        err = dataset.error_message()
        t.error(
            f'Failed, join for {collection_name}, {key}, {obj2}, overwrite = False -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    if obj_result.get('one') != 1:
        t.error(f'Failed to join append key {key}, {obj_result}')
    if obj_result.get("two") != 2:
        t.error(f'Failed to join append key {key}, {obj_result}')
    obj2['one'] = 3
    obj2['two'] = 3
    obj2['three'] = 3
    if dataset.join(collection_name, key, obj2, overwrite=True) == False:
        err = dataset.error_message()
        t.error(
            f'Failed to join overwrite {collection_name}, {key}, {obj2}, overwrite = True -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    for k in obj_result:
        if k != '_Key' and obj_result[k] != 3:
            t.error('Failed to update value in join overwrite', k, obj_result)
Code example #8
File: test_dataset.py  Project: iamgy/py_dataset
def test_issue32(t, collection_name):
    if dataset.create(collection_name, "k1", {"one": 1}) == False:
        err = dataset.error_message()
        t.error("Failed to create k1 in", collection_name, ', ', err)
        return
    if dataset.has_key(collection_name, "k1") == False:
        t.error("Failed, has_key k1 should return", True)
    if dataset.has_key(collection_name, "k2") == True:
        t.error("Failed, has_key k2 should return", False)
Code example #9
File: test_dataset.py  Project: iamgy/py_dataset
def test_issue12(t, c_name):
    src = '''[
{"id": "1", "c1": 1, "c2": 2, "c3": 3 },
{"id": "2", "c1": 2, "c2": 2, "c3": 3 },
{"id": "3", "c1": 3, "c2": 3, "c3": 3 },
{"id": "4", "c1": 1, "c2": 1, "c3": 1 },
{"id": "5", "c1": 6, "c2": 6, "c3": 6 }
]'''
    #dataset.verbose_on() # DEBUG
    #dataset.use_strict_dotpath(True) # DEBUG
    if dataset.status(c_name) == False:
        if not dataset.init(c_name):
            err = dataset.error_message()
            t.error(f'failed to create {c_name}, {err}')
            return
    objects = json.loads(src)
    for obj in objects:
        key = obj['id']
        if dataset.has_key(c_name, key):
            dataset.update(c_name, key, obj)
        else:
            dataset.create(c_name, key, obj)
    f_names = dataset.frames(c_name)
    for f_name in f_names:
        ok = dataset.delete_frame(c_name, f_name)
        if ok == False:
            err = dataset.error_message()
            t.error(f'Failed to delete {f_name} from {c_name} -> "{err}"')
            return
        if dataset.has_frame(c_name, f_name) == True:
            t.error(
                f'Failed to delete frame {c_name} from {c_name}, frame still exists'
            )
            return
    f_name = 'issue12'
    dot_paths = [".c1", "c3"]
    labels = [".col1", ".col3"]
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(f'failed to create {f_name} from {c_name}, {err}')
    if not dataset.has_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected frame {f_name} to exist, {err}')
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) == 0:
        err = dataset.error_message()
        t.error(f'expected keys in {f_name}, got zero, {err}')
        return
    f_objects = dataset.frame_objects(c_name, f_name)
    if len(f_objects) == 0:
        err = dataset.error_message()
        t.error(f'expected objects in {f_name}, got zero, {err}')
        return
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected to delete {f_name} in {c_name}, {err}')
Code example #10
def get_records(dot_paths, f_name, d_name, keys, labels=None, clear=True):
    if dataset.has_frame(d_name, f_name):
        if clear:
            dataset.delete_frame(d_name, f_name)
        else:
            dataset.frame_refresh(d_name, f_name)
            return dataset.frame_objects(d_name, f_name)
    if labels:
        if not dataset.frame_create(d_name, f_name, keys, dot_paths, labels):
            err = dataset.error_message()
            print(f"ERROR: Can't create {f_name} in {d_name}, {err}")
    else:
        # If labels aren't provided, just base them on the dot paths
        labels = []
        for d in dot_paths:
            labels.append(d.split(".")[-1])
        if not dataset.frame_create(d_name, f_name, keys, dot_paths, labels):
            err = dataset.error_message()
            print(f"ERROR: Can't create {f_name} in {d_name}, {err}")
    return dataset.frame_objects(d_name, f_name)
Code example #11
File: test_dataset.py  Project: iamgy/py_dataset
def test_clone_sample(t, c_name, sample_size, training_name, test_name):
    if os.path.exists(training_name):
        shutil.rmtree(training_name)
    if os.path.exists(test_name):
        shutil.rmtree(test_name)
    if dataset.clone_sample(c_name, training_name, test_name,
                            sample_size) == False:
        err = dataset.error_message()
        t.error(
            f"can't clone sample {c_name} size {sample_size} into {training_name}, {test_name} error {err}"
        )
Code example #12
File: usage.py  Project: caltechlibrary/ames
def build_aggregate(collection):
    """Build a collection for usage by month.
    Always creates collection from scratch"""
    # Delete existing collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()

    # Find time periods
    start = datetime.fromisoformat("2017-01-01")
    today = datetime.today().date().isoformat()
    date_list = pd.date_range(start, today, freq="MS").strftime("%Y-%m").to_list()

    for month in date_list:
        if not dataset.create(collection, month, {"report-datasets": []}):
            err = dataset.error_message()
            print(err)
Code example #13
File: caltechdata.py  Project: caltechlibrary/ames
def match_codemeta():
    collection = "github_records.ds"
    keys = dataset.keys(collection)
    for k in keys:
        existing, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        if "completed" not in existing:
            print("Processing new record ", k)
            if dataset.attachments(collection, k) != "":
                dataset.detach(collection, k)

                # Update CaltechDATA
                token = os.environ["TINDTOK"]

                infile = open("codemeta.json", "r")
                try:
                    meta = json.load(infile)
                except:
                    print("Invalid json file - Skipping forever ", k)
                else:
                    standardized = codemeta_to_datacite(meta)

                    # Check that all records have a GitHub subject tag
                    add = True
                    for s in standardized["subjects"]:
                        if s["subject"] == "Github":
                            add = False
                        if s["subject"] == "GitHub":
                            add = False
                    if add == True:
                        standardized["subjects"].append({"subject": "GitHub"})
                    response = caltechdata_edit(token, k, standardized, {}, {}, True)
                    print(response)
                os.system("rm codemeta.json")

            existing["completed"] = "True"
            if not dataset.update(collection, k, existing):
                err = dataset.error_message()
                print(f"Unexpected error on read: {err}")
Code example #14
File: crossref_refs.py  Project: caltechlibrary/ames
def get_crossref_refs(prefix, done=False, new=True):
    # New=True will download everything from scratch and delete any existing records

    collection = "crossref_refs.ds"

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix)

    collected = dataset.has_key(collection, "captured")

    cursor = ""
    count = 0
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettiness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)

        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same cursor back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None

    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]

        # Check Deleted
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on read: {err}")
            cursor = records["message"]["next-cursor"]

        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]

    if done:
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
Code example #15
File: caltechdata.py  Project: caltechlibrary/ames
def match_cd_refs():
    token = os.environ["TINDTOK"]

    matches = []
    collection = "caltechdata.ds"
    keys = dataset.keys(collection)
    if "mediaupdate" in keys:
        keys.remove("mediaupdate")

    # Get event data results
    event_data = "crossref_refs.ds"
    event_keys = dataset.keys(event_data)
    event_keys.remove("captured")
    f_name = "match_cd_refs"
    dot_paths = [".obj_id", ".id", ".subj_id"]
    labels = ["obj_id", "id", "subj_id"]
    print("Getting Event Data Records")
    if dataset.has_frame(event_data, f_name):
        if not dataset.frame_reframe(event_data, f_name, event_keys):
            err = dataset.error_message()
            print(f"Failed to reframe {f_name} in {event_data}, {err}")
            exit()
    elif not dataset.frame_create(event_data, f_name, event_keys, dot_paths, labels):
        err = dataset.error_message()
        print(f"Failed to create frame {f_name} in {event_data}, {err}")
        exit()
    grid = dataset.frame_grid(event_data, f_name)
    df = pd.DataFrame(np.array(grid), columns=["obj_id", "id", "subj_id"])
    grouped = df.groupby(["obj_id"])
    groups = grouped.groups
    # Look at all CaltechDATA records
    for k in keys:
        # Collect matched new links for the record
        record_matches = []
        print(k)
        metadata, err = dataset.read(collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        doi = "https://doi.org/" + metadata["identifier"]["identifier"]
        if doi in groups:
            hits = grouped.get_group(doi)
            for index, h in hits.iterrows():
                # Trigger for whether we already have this link
                new = True
                if "relatedIdentifiers" in metadata:
                    for m in metadata["relatedIdentifiers"]:
                        if m["relatedIdentifier"] in h["subj_id"]:
                            new = False
                if new == True:
                    match = h["subj_id"]
                    print(match)
                    print(h["obj_id"])
                    inputv = input("Do you approve this link?  Type Y or N: ")
                    if inputv == "Y":
                        record_matches.append(match)
        # If we have to update record
        if len(record_matches) > 0:
            ids = []
            if "relatedIdentifiers" in metadata:
                for m in metadata["relatedIdentifiers"]:
                    ids.append(m)
            matches.append([k, record_matches])
            # Now collect identifiers for record
            for match in record_matches:
                split = match.split("doi.org/")
                new_id = {
                    "relatedIdentifier": split[1],
                    "relatedIdentifierType": "DOI",
                    "relationType": "IsCitedBy",
                }
                ids.append(new_id)
            newmetadata = {"relatedIdentifiers": ids}
            response = caltechdata_edit(token, k, newmetadata, {}, {}, True)
            print(response)
    return matches
Code example #16
File: test_dataset.py  Project: iamgy/py_dataset
def test_basic(t, collection_name):
    '''test_basic(collection_name) runs tests on basic CRUD ops'''
    # Setup a test record
    key = "2488"
    value = {
        "title":
        "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
        "formats": ["epub", "kindle", "plain text"],
        "authors": [{
            "given": "Jules",
            "family": "Verne"
        }],
        "url": "https://www.gutenberg.org/ebooks/2488"
    }

    # We should have an empty collection; create our test record.
    if dataset.create(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f'create({collection_name}, {key}, {value}) failed, {err}')
        return

    # Check to see that we have only one record
    key_count = dataset.count(collection_name)
    if key_count != 1:
        t.error(f"Failed, expected count to be 1, got {key_count}")

    # Do a minimal test to see if the record looks like it has content
    keyList = dataset.keys(collection_name)
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error(f"epxected {rec[k]} got {v}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error(f"Failed, expected {k} with list v, got {v}")

    # Test updating record
    value["verified"] = True
    if dataset.update(collection_name, key, value) == False:
        err = dataset.error_message()
        t.error(f"update({collection_name}, {key}, {value}) failed, {err}")
    rec, err = dataset.read(collection_name, key)
    if err != "":
        t.error(f"Unexpected error for {key} in {collection_name}, {err}")
    for k, v in value.items():
        if not isinstance(v, list):
            if k in rec and rec[k] == v:
                t.print("OK, found", k, " -> ", v)
            else:
                t.error("expected {rec[k]} got {v} for key {k}")
        else:
            if k == "formats" or k == "authors":
                t.print("OK, expected lists for", k, " -> ", v)
            else:
                t.error("Failed, expected {k} with a list for v, got {v}")

    # Test path to record
    expected_s = "/".join(
        [collection_name, "pairtree", "24", "88", (key + ".json")])
    expected_l = len(expected_s)
    p = dataset.path(collection_name, key)
    if len(p) != expected_l:
        t.error("Failed, expected length", expected_l, "got", len(p))
    if p != expected_s:
        t.error("Failed, expected", expected_s, "got", p)

    # Test listing records
    l = dataset.list(collection_name, [key])
    if len(l) != 1:
        t.error(
            f"list({collection_name}, [{key}]) failed, list should return an array of one record, got",
            l)
        return

    # test deleting a record
    if dataset.delete(collection_name, key) == False:
        err = dataset.error_message()
        t.error("Failed, could not delete record", key, ", ", err)
Code example #17
File: usage.py  Project: caltechlibrary/ames
def aggregate_usage(usage_collection, month_collection):
    keys = dataset.keys(usage_collection)
    keys.remove("end-date")
    for k in progressbar(keys):
        record, err = dataset.read(usage_collection, k)
        if err != "":
            print(err)
        use = {}
        views = {}
        for usage in record["performance"]:
            split = usage["period"].split("-")
            month = split[0] + "-" + split[1]
            for u in usage["instance"]:
                metric = u["metric-type"]
                if metric == "unique-dataset-requests":
                    if month in use:
                        use[month] += u["count"]
                    else:
                        use[month] = u["count"]
                if metric == "unique-dataset-investigations":
                    if month in views:
                        views[month] += u["count"]
                    else:
                        views[month] = u["count"]
        # Strip non-counter stuff
        record.pop("_Key")
        record.pop("grand-total-unique-requests")
        record.pop("grand-total-unique-investigations")
        # go across months
        for view in views:
            split = view.split("-")
            date_obj = datetime(int(split[0]), int(split[1]), 1)
            d_range = get_month_day_range(date_obj)
            performance = [
                {
                    "period": {
                        "begin-date": d_range[0].date().isoformat(),
                        "end-date": d_range[1].date().isoformat(),
                    },
                    "instance": [],
                }
            ]
            v = views[view]
            performance[0]["instance"].append(
                {
                    "count": v,
                    "metric-type": "unique-dataset-investigations",
                    "access-method": "regular",
                }
            )
            # Handle when we have both views and uses in a given month
            if view in use:
                u = use[view]
                performance[0]["instance"].append(
                    {
                        "count": u,
                        "metric-type": "unique-dataset-requests",
                        "access-method": "regular",
                    }
                )
            existing, err = dataset.read(month_collection, view)
            if err != "":
                print(err)
            record["performance"] = performance
            existing["report-datasets"].append(record)
            if not dataset.update(month_collection, view, existing):
                err = dataset.error_message()
                print(err)
        for use_date in use:
            # We only have use-only records left to handle
            if use_date not in views:
                u = use[use_date]
                split = use_date.split("-")
                date_obj = datetime(int(split[0]), int(split[1]), 1)
                d_range = get_month_day_range(date_obj)
                performance = [
                    {
                        "period": {
                            "begin-date": d_range[0].date().isoformat(),
                            "end-date": d_range[1].date().isoformat(),
                        },
                        "instance": [
                            {
                                "count": u,
                                "metric-type": "unique-dataset-requests",
                                "access-method": "regular",
                            }
                        ],
                    }
                ]
                existing, err = dataset.read(month_collection, use_date)
                if err != "":
                    print(err)
                record["performance"] = performance
                existing["report-datasets"].append(record)
                if not dataset.update(month_collection, use_date, existing):
                    err = dataset.error_message()
                    print(err)
Code example #18
File: test_dataset.py  Project: iamgy/py_dataset
def test_sync_csv(t, c_name):
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return

    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    csv_name = c_name.strip(".ds") + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)

    # Import CSV into collection
    if dataset.import_csv(c_name, csv_name, True) == False:
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if dataset.has_key(c_name, key) == False:
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key('five') == False, got True")
    if dataset.create(c_name, "five", {"value": 5}) == False:
        err = dataset.error_message()
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return

    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if dataset.frame_create(c_name, frame_name, keys, ["._Key", ".value"],
                            ["_Key", "value"]) == False:
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return

    #NOTE: Tests for sync_send_csv and sync_receive_csv
    if dataset.sync_send_csv(c_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
        if 'five' not in src:
            t.error(f"expected 'five' in src, got {src}")

    # Now remove "five" from collection
    if dataset.delete(c_name, "five") == False:
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key(five) == False, got True")
        return
    if dataset.sync_recieve_csv(c_name, frame_name, csv_name, False) == False:
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if dataset.has_key(c_name, "five") == False:
        t.error(f"expected has_key(five) == True, got False")
        return
Code example #19
File: test_dataset.py  Project: iamgy/py_dataset
def test_frame_objects(t, c_name):
    if dataset.status(c_name) == True:
        dataset.close(c_name)
        if os.path.exists(c_name):
            shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}), {err}')
        return
    data = [{
        "id":
        "A",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }, {
            "nameIdentifier": "H-XXXX-XXXX",
            "nameIdentifierScheme": "ResearcherID",
            "schemeURI": "http://www.researcherid.com/rid/"
        }],
        "two":
        22,
        "three":
        3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id":
        "D",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }],
        "two":
        20,
        "three":
        334.1,
        "four": []
    }]
    keys = []
    dot_paths = [
        "._Key", ".nameIdentifiers", ".nameIdentifiers[:].nameIdentifier",
        ".two", ".three", ".four"
    ]
    labels = [
        "id", "nameIdentifiers", "nameIdentifier", "two", "three", "four"
    ]
    for row in data:
        key = row['id']
        keys.append(key)
        err = dataset.create(c_name, key, row)
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(
            f'frame_create({c_name}, {f_name}, {keys}, {dot_paths}, {labels}), {err}'
        )
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) != len(keys):
        t.error(f'expected {len(keys)}, got {len(f_keys)}')
    if dataset.frame_refresh(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'frame_refresh({c_name}, {f_name}), {err}')
    l = dataset.frames(c_name)
    if len(l) != 1 or l[0] != 'f1':
        t.error(f"expected one frame name, f1, got {l}")
    object_result = dataset.frame_objects(c_name, f_name)
    if len(object_result) != 4:
        t.error(
            f'Did not get correct number of objects back, expected 4 got {len(object_result)}, {object_result}'
        )
    count_nameId = 0
    count_nameIdObj = 0
    for obj in object_result:
        if 'id' not in obj:
            t.error('Did not get id in object')
        if 'nameIdentifiers' in obj:
            count_nameId += 1
            for idv in obj['nameIdentifiers']:
                if 'nameIdentifier' not in idv:
                    t.error('Missing part of object')
        if 'nameIdentifier' in obj:
            count_nameIdObj += 1
            if "0000-000X-XXXX-XXXX" not in obj['nameIdentifier']:
                t.error('Missing object in complex dot path')
    if count_nameId != 2:
        t.error(
            f"Incorrect number of nameIdentifiers elements, expected 2, got {count_nameId}"
        )
    if count_nameIdObj != 2:
        t.error(
            f"Incorrect number of nameIdentifier elements, expected 2, got {count_nameIdObj}"
        )
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
Code example #20
File: test_dataset.py  Project: iamgy/py_dataset
def test_keys(t, collection_name):
    '''test_keys(collection_name) tests getting, filtering and sorting keys'''
    # Test count after delete
    key_list = dataset.keys(collection_name)
    cnt = dataset.count(collection_name)
    if cnt != 0:
        t.error("Failed, expected zero records, got", cnt, key_list)

    #
    # Generate multiple records for collection for testing keys
    #
    test_records = {
        "gutenberg:21489": {
            "title": "The Secret of the Island",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "http://www.gutenberg.org/ebooks/21489",
            "categories": "fiction, novel"
        },
        "gutenberg:2488": {
            "title":
            "Twenty Thousand Leagues Under the Seas: An Underwater Tour of the World",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jules",
                "family": "Verne"
            }],
            "url": "https://www.gutenberg.org/ebooks/2488",
            "categories": "fiction, novel"
        },
        "gutenberg:21839": {
            "title": "Sense and Sensibility",
            "formats": ["epub", "kindle", "plain text"],
            "authors": [{
                "given": "Jane",
                "family": "Austin"
            }],
            "url": "http://www.gutenberg.org/ebooks/21839",
            "categories": "fiction, novel"
        },
        "gutenberg:3186": {
            "title": "The Mysterious Stranger, and Other Stories",
            "formats": ["epub", "kindle", "plain text", "html"],
            "authors": [{
                "given": "Mark",
                "family": "Twain"
            }],
            "url": "http://www.gutenberg.org/ebooks/3186",
            "categories": "fiction, short story"
        },
        "hathi:uc1321060001561131": {
            "title":
            "A year of American travel - Narrative of personal experience",
            "formats": ["pdf"],
            "authors": [{
                "given": "Jessie Benton",
                "family": "Fremont"
            }],
            "url":
            "https://babel.hathitrust.org/cgi/pt?id=uc1.32106000561131;view=1up;seq=9",
            "categories": "non-fiction, memoir"
        }
    }
    test_count = len(test_records)

    for k in test_records:
        v = test_records[k]
        if dataset.create(collection_name, k, v) == False:
            err = dataset.error_message()
            t.error("Failed, could not add", k, "to", collection_name, ', ',
                    err)

    # Test keys, filtering keys and sorting keys
    all_keys = dataset.keys(collection_name)
    if len(all_keys) != test_count:
        t.error("Expected", test_count, "all_keys back, got", keys)

    #dataset.verbose_on()
    filter_expr = '(eq .categories "non-fiction, memoir")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 1:
        t.error(
            f"key_filter({collection_name}, {keys}, {filter_expre}), Expected one key for",
            filter_expr, "got", filtered_keys)

    filter_expr = '(contains .categories "novel")'
    filtered_keys = dataset.key_filter(collection_name, all_keys, filter_expr)
    if len(filtered_keys) != 3:
        t.error(
            f"key_filter({collection_name}, {keys}, {filter_expr}), Expected three keys for",
            filter_expr, "got", filtered_keys)

    sort_expr = '+.title'
    filter_expr = '(contains .categories "novel")'
    sorted_keys = dataset.key_sort(collection_name, filtered_keys, sort_expr)
    if len(sorted_keys) != 3:
        t.error(
            f"key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected three keys for",
            filter_expr, "got", sorted_keys)
    expected_keys = ["gutenberg:21839", "gutenberg:21489", "gutenberg:2488"]
    for i, k in enumerate(expected_keys):
        if i < len(sorted_keys) and sorted_keys[i] != k:
            obj1, _ = dataset.read(collection_name, k)
            obj2, _ = dataset.read(collection_name, sorted_keys[i])
            t.error(
                f'key_sort({collection_name}, {filtered_keys}, {sort_expr}), Expected {k} (title "{obj1["title"]}") got {sorted_keys[i]} (title "{obj2["title"]}")'
            )
Code example #21
File: test_dataset.py  Project: iamgy/py_dataset
def test_issue43(t, collection_name, csv_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if os.path.exists(csv_name):
        os.remove(csv_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'Failed, need a {collection_name} to run test, {err}')
        return
    table = {
        "r1": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        },
        "r2": {
            "c1": "one",
            "c3": "three",
            "c4": "four"
        },
        "r3": {
            "c1": "one",
            "c2": "two",
            "c4": "four"
        },
        "r4": {
            "c1": "one",
            "c2": "two",
            "c3": "three"
        },
        "r5": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        }
    }
    for key in table:
        row = table[key]
        if dataset.create(collection_name, key, row) == False:
            err = dataset.error_message()
            t.error(f"Can't add test row {key} to {collection_name}, {err}")
            return

    dataset.use_strict_dotpath(False)
    # Setup frame
    frame_name = 'f1'
    keys = dataset.keys(collection_name)
    if dataset.frame_create(collection_name, frame_name, keys,
                            ["._Key", ".c1", ".c2", ".c3", ".c4"],
                            ["_Key", "c1", "c2", "c3", "c4"]) == False:
        err = dataset.error_message()
        t.error(err)
        return
    if dataset.export_csv(collection_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'export_csv({collection_name}, {frame_name}, {csv_name} should have emitted warnings, not error, {err}'
        )
        return
    with open(csv_name, mode='r', encoding='utf-8') as f:
        rows = f.read()

    for row in rows.split('\n'):
        if len(row) > 0:
            cells = row.split(',')
            if len(cells) < 5:
                t.error(f'row error {csv_name} for {cells}')
Code example #22
File: test_dataset.py  Project: iamgy/py_dataset
def test_attachments(t, collection_name):
    t.print("Testing attach, attachments, detach and prune")
    # Generate two files to attach.
    with open('a1.txt', 'w') as text_file:
        text_file.write('This is file a1')
    with open('a2.txt', 'w') as text_file:
        text_file.write('This is file a2')
    filenames = ['a1.txt', 'a2.txt']

    if dataset.status(collection_name) == False:
        t.error("Failed,", collection_name, "missing")
        return
    keys = dataset.keys(collection_name)
    if len(keys) < 1:
        t.error("Failed,", collection_name, "should have keys")
        return

    key = keys[0]
    if dataset.attach(collection_name, key, filenames) == False:
        err = dataset.error_message()
        t.error("Failed, to attach files for", collection_name, key, filenames,
                ', ', err)
        return

    l = dataset.attachments(collection_name, key)
    if len(l) != 2:
        t.error("Failed, expected two attachments for", collection_name, key,
                "got", l)
        return

    # Check that attachments aren't impacted by update
    if dataset.update(collection_name, key, {"testing": "update"}) == False:
        err = dataset.error_message()
        t.error("Failed, to update record", collection_name, key, err)
        return
    l = dataset.attachments(collection_name, key)
    if len(l) != 2:
        t.error("Failed, expected two attachments after update for",
                collection_name, key, "got", l)
        return

    if os.path.exists(filenames[0]):
        os.remove(filenames[0])
    if os.path.exists(filenames[1]):
        os.remove(filenames[1])

    # First try detaching one file.
    if dataset.detach(collection_name, key, [filenames[1]]) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for", collection_name, key,
                filenames[1], ', ', err)
    if os.path.exists(filenames[1]):
        os.remove(filenames[1])
    else:
        t.error("Failed to detch", filenames[1], "from", collection_name, key)

    # Test explicit filenames detach
    if dataset.detach(collection_name, key, filenames) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for", collection_name, key, filenames,
                ', ', err)

    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)
        else:
            t.error("Failed, expected", fname, "to be detached from",
                    collection_name, key)

    # Test detaching all files
    if dataset.detach(collection_name, key, []) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for (detaching all)", collection_name,
                key, ', ', err)
    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)
        else:
            t.error("Failed, expected", fname, "for detaching all from",
                    collection_name, key)

    if dataset.prune(collection_name, key, [filenames[0]]) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for prune", collection_name, key,
                [filenames[0]], ', ', err)
    l = dataset.attachments(collection_name, key)
    if len(l) != 1:
        t.error("Failed, expected one file after prune for", collection_name,
                key, [filenames[0]], "got", l)

    if dataset.prune(collection_name, key, []) == False:
        err = dataset.error_message()
        t.error("Failed, expected True for prune (all)", collection_name, key,
                ', ', err)
    l = dataset.attachments(collection_name, key)
    if len(l) != 0:
        t.error("Failed, expected zero files after prune for", collection_name,
                key, "got", l)