Example #1
0
def make_link_history(collection, resolver, url, note):
    """Record the current state of a resolver link in the history collection.

    Both the expected target ``url`` and the resolver address built from
    ``resolver`` are fetched; problems (non-200 status, redirects,
    mismatches) are reported on stdout.  The resulting entry is created in
    ``collection`` under the key ``resolver``, or, when a record already
    exists and ``save_history`` says so, the old record is pushed onto the
    entry's ``history`` list and the record is updated.

    Exits the process if the dataset read/create/update fails.
    """

    def fetch(address):
        # A connection failure is represented by a blank response with a
        # 404 status and an empty URL so the checks below still run.
        try:
            return requests.get(address)
        except requests.exceptions.ConnectionError:
            placeholder = requests.Response()
            placeholder.status_code = 404
            placeholder.url = ''
            return placeholder

    timestamp = datetime.today().isoformat()

    # Check the final URL on its own
    destination = fetch(url)
    if destination.status_code != 200:
        print(
            f"Target URL {url} returns Error status code {destination.status_code}")
    if links_differ(destination.url, url):
        print(f"Target URL '{url}' redirects to '{destination.url}'")

    # Check where the resolver actually sends us
    resolved = fetch(f"https://resolver.library.caltech.edu/{resolver}")
    if links_differ(resolved.url, url):
        print(
            f"Mismatch between expected url '{url}' and actual '{resolved.url}'")
    if resolved.status_code != 200:
        print(
            f"Resolver URL ({resolver}) '{resolved.url}' returns Error status code {resolved.status_code}"
        )

    record = {
        "expected-url": url,
        "url": resolved.url,
        "modified": timestamp,
        "code": resolved.status_code,
        "note": note,
    }

    # Brand new resolver: start with an empty history
    if not dataset.has_key(collection, resolver):
        record["history"] = []
        if not dataset.create(collection, resolver, record):
            print(dataset.error_message())
            exit()
        return

    previous, err = dataset.read(collection, resolver)
    if err != "":
        print(err)
        exit()
    if not save_history(previous, url, resolved):
        # Nothing worth recording; leave the stored record untouched
        return

    # Push the previous record (minus its own history) onto the history list
    earlier = previous.pop("history")
    earlier.append(previous)
    record["history"] = earlier
    if not dataset.update(collection, resolver, record):
        print(dataset.error_message())
        exit()
Example #2
0
def test_join(t, collection_name):
    """Exercise ``dataset.join`` in append and overwrite mode.

    A record ``test_join1`` holding ``{"one": 1}`` is created (or updated if
    it already exists), then joined with ``{"two": 2}`` without overwrite,
    and finally joined with an object setting every field to 3 with
    overwrite enabled.  Failures are reported through ``t.error``.

    Args:
        t: test harness object providing an ``error`` method.
        collection_name: name of the dataset collection to test against.
    """
    key = "test_join1"
    obj1 = {"one": 1}
    obj2 = {"two": 2}
    if not dataset.status(collection_name):
        t.error("Failed, collection status is False,", collection_name)
        return

    # Seed the record. The update branch previously referenced a misspelled
    # variable name and raised NameError whenever the key already existed.
    if dataset.has_key(collection_name, key):
        ok = dataset.update(collection_name, key, obj1)
    else:
        ok = dataset.create(collection_name, key, obj1)
    if not ok:
        err = dataset.error_message()
        t.error(
            f'Failed, could not add record for test ({collection_name}, {key}, {obj1}), {err}'
        )
        return

    # Append mode: existing fields are kept, new fields are added
    if not dataset.join(collection_name, key, obj2, overwrite=False):
        err = dataset.error_message()
        t.error(
            f'Failed, join for {collection_name}, {key}, {obj2}, overwrite = False -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    if obj_result.get('one') != 1:
        t.error(f'Failed to join append key {key}, {obj_result}')
    if obj_result.get("two") != 2:
        t.error(f'Failed to join append key {key}, {obj_result}')

    # Overwrite mode: every field except the key should end up as 3
    obj2['one'] = 3
    obj2['two'] = 3
    obj2['three'] = 3
    if not dataset.join(collection_name, key, obj2, overwrite=True):
        err = dataset.error_message()
        t.error(
            f'Failed to join overwrite {collection_name}, {key}, {obj2}, overwrite = True -> {err}'
        )
    obj_result, err = dataset.read(collection_name, key)
    if err != '':
        t.error(f'Unexpected error for {key} in {collection_name}, {err}')
    for k in obj_result:
        if k != '_Key' and obj_result[k] != 3:
            t.error('Failed to update value in join overwrite', k, obj_result)
Example #3
0
def get_history(collection, caltechdata_collection, caltechdata_keys):
    """Harvest the history of records from CaltechDATA.

    Records in ``caltechdata_collection`` whose ``updated`` date is later
    than the date stored in the ``historyupdate`` file (or 2011-01-01 when
    the file is absent) have their revisions fetched and stored in
    ``collection`` under keys of the form ``<record key>-<revision index>``.

    The ``historyupdate`` checkpoint is only advanced when every record was
    read and harvested without error, so failed records are retried on the
    next run.  Revisions already present in ``collection`` are skipped.

    Args:
        collection: dataset collection that receives the revisions.
        caltechdata_collection: dataset collection holding current records.
        caltechdata_keys: iterable of keys to consider.
    """

    complete = True
    keys_to_update = []
    if os.path.exists("historyupdate"):
        with open("historyupdate", "r") as infile:
            # strip() tolerates a trailing newline added by an editor
            update = date.fromisoformat(infile.read().strip())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(caltechdata_keys, redirect_stdout=True):
        existing, err = dataset.read(caltechdata_collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
            # The record could not be read, so it has no usable "updated"
            complete = False
            continue
        record_update = datetime.fromisoformat(existing["updated"]).date()
        if record_update > update:
            keys_to_update.append(k)

    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    base_url = "https://data.caltech.edu/records/"

    for k in progressbar(keys_to_update):
        url = base_url + str(k) + "/revisions"
        response = requests.get(url)
        if response.status_code != 200:
            # An error payload must not be enumerated as if it were revisions
            print(f"Unexpected status {response.status_code} for {url}")
            complete = False
            continue
        revisions = response.json()
        for num, metadata in enumerate(revisions):
            key = f"{k}-{num}"
            if not dataset.has_key(collection, key):
                if not dataset.create(collection, key, metadata):
                    err = dataset.error_message()
                    print(f"Unexpected error on create: {err}")
                    complete = False

    if not complete:
        print("History harvest incomplete; historyupdate not advanced")
        return

    # Save date in file
    today = date.today().isoformat()
    with open("historyupdate", "w") as outfile:
        outfile.write(today)
Example #4
0
def get_crossref_refs(prefix, done=False, new=True):
    """Harvest Crossref Event Data events into the ``crossref_refs.ds`` collection.

    Args:
        prefix: value appended to the ``obj-id.prefix=`` query parameter.
        done: when true, store today's date under the ``captured`` key once
            harvesting has finished.
        new: when True, delete any existing collection directory and start
            from scratch.

    Errors from the dataset calls are printed; a failed collection init
    exits the process.
    """
    # New=True will download everything from scratch and delete any existing records

    collection = "crossref_refs.ds"

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    # NOTE(review): "[email protected]" looks like an obfuscated e-mail/path
    # fragment rather than a real endpoint - confirm the intended URL.
    base_url = (
        "https://api.eventdata.crossref.org/v1/[email protected]&source=crossref&obj-id.prefix="
        + prefix)

    # A "captured" record means an earlier run stored its harvest date
    collected = dataset.has_key(collection, "captured")

    # Page through results: "" = first page, None = stop
    cursor = ""
    count = 0
    while cursor != None:
        if collected == True:
            # Incremental harvest: only ask for events collected since the
            # stored date (re-read on every page)
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Running total, also used for the stop test below
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)

        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same cursor back at end of results
            break
        # Keep paging only while the reported total exceeds what we have seen
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None

    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]

        # Check Deleted
        # NOTE(review): this URL is identical to the one in the "Check Edited"
        # loop below; presumably they were distinct endpoints - confirm.
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on read: {err}")
            # Loop ends when the response carries a None next-cursor
            cursor = records["message"]["next-cursor"]

        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]

    if done:
        # Record today's date so the next run can harvest incrementally.
        # NOTE(review): this call requires `datetime` to be the module, not
        # the class - confirm against the file's imports.
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
Example #5
0
# Make sure the collection exists before ingesting anything
if not os.path.exists("boutique.ds"):
    dataset.init("boutique.ds")

# Walk docs_dir and store every Markdown file that carries an "id" in its
# front matter; the record holds the front matter, the file path and the
# raw source text.
for path, folders, files in os.walk(docs_dir):
    for filename in files:
        if not filename.endswith(".md"):
            continue
        f_name = os.path.join(path, filename)
        log.print(f"Ingesting {f_name}")
        metadata = frontmatter(f_name)
        with open(f_name) as f:
            src = f.read()
        if "id" not in metadata:
            log.print(f"Warning, no front matter for {f_name}")
            continue
        key = str(metadata["id"])
        # Existing keys are updated, new keys are created - same payload
        save = dataset.update if dataset.has_key(c_name, key) else dataset.create
        err = save(c_name, key, {
            "metadata": metadata,
            "content": f_name,
            "src": src
        })
        if err != "":
            log.fatal(err)
Example #6
0
def get_wos_refs(new=True):
    """Download Web of Science records for Caltech into ``all_wos.ds``.

    The API key is read from the ``WOSTOK`` environment variable.  When the
    collection already holds a ``captured`` date, only records loaded since
    that date are requested.  Records are fetched 100 at a time and handed
    to ``write_records``.

    Args:
        new: when True, delete any existing collection and download
            everything from scratch.

    A page whose response does not contain the expected record list is
    printed and skipped instead of crashing or re-writing the previous
    page's records.
    """
    page_size = 100
    collection = "all_wos.ds"

    if new == True and os.path.exists(collection):
        shutil.rmtree(collection)

    if not os.path.isdir(collection):
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()

    # Get access token from WOS set as environment variable with source token.bash
    token = os.environ["WOSTOK"]

    headers = {"X-ApiKey": token, "Content-type": "application/json"}

    # Run query to get scope of records

    base_url = "https://api.clarivate.com/api/wos/?databaseId=WOK"

    collected = dataset.has_key(collection, "captured")

    if collected == True:
        # Limit the query to the number of days since the last capture
        date = dataset.read(collection, "captured")
        date = date[0]["captured"]
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = current - date
        base_url = base_url + "&loadTimeSpan=" + str(diff.days) + "D"

    # NOTE(review): the capture date is stored before the download runs, and
    # the result of update/create is compared with "" - confirm both against
    # the installed dataset library version.
    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        err = dataset.update(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on create: {err}")

    query = "OG=(California Institute of Technology)"
    query = urllib.parse.quote_plus(query)
    url = base_url + "&usrQuery=" + query + "&count=100&firstRecord=1"

    response = requests.get(url, headers=headers)
    response = response.json()
    record_count = response["QueryResult"]["RecordsFound"]
    print(record_count, " Records from WOS")
    query_id = response["QueryResult"]["QueryID"]
    try:
        records = response["Data"]["Records"]["records"]["REC"]
    except (KeyError, TypeError):
        # Without a first page there is nothing to write or to page through
        print(response)
        return
    write_records(records, collection)
    # We have saved the first 100 records
    record_start = page_size + 1
    record_count = record_count - page_size

    query_url = "https://api.clarivate.com/api/wos/query/"

    while record_count > 0:
        print(record_count)
        print(len(records), "records")
        # Full pages until fewer than a page remains, then exactly the rest
        batch = min(record_count, page_size)
        url = (
            query_url
            + str(query_id)
            + "?count="
            + str(batch)
            + "&firstRecord="
            + str(record_start)
        )
        response = requests.get(url, headers=headers)
        response = response.json()
        try:
            records = response["Records"]["records"]["REC"]
        except (KeyError, TypeError):
            # Report the bad page and move on; do not re-write stale records
            print(response)
        else:
            write_records(records, collection)
        record_start = record_start + batch
        record_count = record_count - batch

    print("Downloaded all records ")
Example #7
0
def test_sync_csv(t, c_name):
    """Round-trip a collection through a CSV file using the sync functions.

    The collection ``c_name`` is recreated, three rows are imported from a
    freshly written CSV file, a fourth record is created and pushed to the
    CSV with ``sync_send_csv``, deleted from the collection, and finally
    restored from the CSV.  Failures are reported through ``t.error``.

    Args:
        t: test harness object providing an ``error`` method.
        c_name: collection name; a trailing ``.ds`` is replaced by ``.csv``
            to name the CSV file.
    """
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if not dataset.init(c_name):
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return

    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    # str.strip(".ds") removes any of the characters '.', 'd', 's' from both
    # ends (e.g. "sync.ds" -> "ync"); remove the suffix explicitly instead.
    suffix = ".ds"
    stem = c_name[:-len(suffix)] if c_name.endswith(suffix) else c_name
    csv_name = stem + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    # newline='' lets the csv module control line endings on every platform
    with open(csv_name, 'w', newline='') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        csv_writer.writerows(t_data)

    # Import CSV into collection
    if not dataset.import_csv(c_name, csv_name, True):
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if not dataset.has_key(c_name, key):
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five"):
        t.error("expected has_key('five') == False, got True")
    if not dataset.create(c_name, "five", {"value": 5}):
        err = dataset.error_message()
        # Doubled braces keep the dict literal in the message; single braces
        # were parsed as a replacement field and printed just "value".
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return

    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, frame_name, keys,
                                ["._Key", ".value"], ["_Key", "value"]):
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return

    #NOTE: Tests for sync_send_csv and sync_receive_csv
    if not dataset.sync_send_csv(c_name, frame_name, csv_name):
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
        if 'five' not in src:
            t.error(f"expected 'five' in src, got {src}")

    # Now remove "five" from collection
    if not dataset.delete(c_name, "five"):
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five"):
        t.error("expected has_key(five) == False, got True")
        return
    # NOTE(review): "sync_recieve_csv" is kept exactly as the original call
    # spells it - confirm the name against the dataset library.
    if not dataset.sync_recieve_csv(c_name, frame_name, csv_name, False):
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if not dataset.has_key(c_name, "five"):
        t.error("expected has_key(five) == True, got False")
        return
Example #8
0
def get_cd_github(new=True):
    """Harvest CaltechDATA GitHub records into ``github_records.ds``.

    Each record not yet in the collection is created, its files are
    downloaded with ``download_file``, and a ``codemeta.json`` found
    directly below the top-level folder of a downloaded zip archive is
    extracted and attached to the record.  Downloaded files and the
    extracted ``codemeta.json`` are removed afterwards.

    Args:
        new: when true, delete any existing collection first.
    """
    # Local import keeps this block self-contained
    import shutil

    collection = "github_records.ds"

    if new:
        # Replaces "rm -rf": no shell involved, missing directory is fine
        shutil.rmtree(collection, ignore_errors=True)

    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    url = "https://data.caltech.edu/api/records"

    response = requests.get(url + "/?size=1000&q=subjects:GitHub")
    hits = response.json()

    for h in hits["hits"]["hits"]:
        rid = str(h["id"])
        record = h["metadata"]

        if dataset.has_key(collection, rid):
            continue

        dataset.create(collection, rid, record)

        print("Downloading files for ", rid)

        codemeta = False

        for erecord in record["electronic_location_and_access"]:
            f = download_file(erecord, rid)
            archive = f.rstrip()

            # We're just looking for the zip file
            if f.split(".")[-1] == "zip":
                zip_files = subprocess.check_output(
                    ["unzip", "-l", archive],
                    universal_newlines=True).splitlines()
                # Skip the first four lines of the listing (header), then
                # read entries until the dashed footer line.
                for line in zip_files[4:]:
                    if line.startswith("-"):
                        break
                    parts = line.split("/")
                    # Will only identify codemeta files in root of repo;
                    # lines without a "/" have no second path component.
                    if len(parts) > 1 and parts[1] == "codemeta.json":
                        member = line.split("   ")[-1]
                        # Argument list instead of a shell string so file
                        # names are never interpreted by a shell
                        subprocess.run(
                            ["unzip", "-j", archive, member, "-d", "."])
                        codemeta = True

            # Trash downloaded files - extracted codemeta.json not impacted
            print("Trash " + f)
            try:
                os.remove(archive)
            except OSError as exc:
                print(f"Could not remove {archive}: {exc}")

        if codemeta:
            print(collection, rid)
            response = dataset.attach(collection, rid, ["codemeta.json"])
            print("Attachment ", response)
            try:
                os.remove("codemeta.json")
            except OSError as exc:
                print(f"Could not remove codemeta.json: {exc}")
            print("Trash codemeta.json")