Example #1
def test_setup(t, collection_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f"init({collection_name}) failed, {err}")
        return
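Below is a minimal, self-contained sketch (not part of the original test suite) of the init/create/read round trip that test_setup prepares for; the collection name is illustrative, and only calls that appear elsewhere on this page are used.

import os
import shutil
from py_dataset import dataset

c_name = "example.ds"  # hypothetical collection name, for illustration only
if os.path.exists(c_name):
    shutil.rmtree(c_name)
if not dataset.init(c_name):
    print(dataset.error_message())
elif not dataset.create(c_name, "one", {"one": 1}):
    print(dataset.error_message())
else:
    # dataset.read() returns an (object, error-string) pair in these examples
    obj, err = dataset.read(c_name, "one")
    print(obj, err)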
Example #2
def get_caltechdata(collection, production=True, datacite=False):
    """Harvest all records from CaltechDATA.
    Always creates collection from scratch"""
    # Delete existing collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()

    if production == True:
        url = "https://data.caltech.edu/api/records"
    else:
        url = "https://cd-sandbox.tind.io/api/records"

    response = requests.get(url + "/?size=9000")
    hits = response.json()

    print(hits)
    for h in progressbar(hits["hits"]["hits"]):
        rid = str(h["id"])
        # Get enriched metadata records (including files)
        if datacite == False:
            metadata = decustomize_schema(h["metadata"], True, True, True)
            metadata["updated"] = h["updated"]
        else:
            # Get just DataCite metadata
            metadata = decustomize_schema(h["metadata"])

        if not dataset.create(collection, rid, metadata):
            err = dataset.error_message()
            print(err)
Example #3
def build_usage(caltechdata_collection, usage_collection):
    """Build collection of records that contain CaltechDATA usage
    information"""
    if not os.path.isdir(usage_collection):
        if not dataset.init(usage_collection):
            print("Dataset failed to init collection")
            exit()
        # Write date to start collecting statistics for new collection
        dataset.create(usage_collection, "end-date", {"end-date": 1485907200})
    # Build out structure for all CaltechDATA records
    ids = dataset.keys(caltechdata_collection)
    for k in ids:
        if dataset.has_key(usage_collection, k) == False:
            metadata, err = dataset.read(caltechdata_collection, k)
            # When record was submitted to CaltechDATA:
            rdate = None
            submitted = None
            issued = None
            if "dates" in metadata:
                doi = metadata["identifier"]["identifier"]
                for date in metadata["dates"]:
                    if date["dateType"] == "Submitted":
                        rdate = date["date"]
                    if date["dateType"] == "Updated":
                        submitted = date["date"]
                    if date["dateType"] == "Issued":
                        issued = date["date"]
                if rdate == None:
                    if submitted != None:
                        rdate = submitted
                    else:
                        rdate = issued
            else:
                # Dummy values for junk records
                rdate = "2020-04-01"
                doi = ""
            # Dataset is the only supported type in the spec and we are
            # following the dataset standards for usage
            # All dates are the date added to CaltechDATA, which is
            # the appropriate 'publication' date even if content was available
            # earlier
            record_data = {
                "dataset-id": [{"type": "doi", "value": doi}],
                "uri": "https://data.caltech.edu/records/" + k,
                "publisher": "CaltechDATA",
                "platform": "CaltechDATA",
                "publisher-id": [{"type": "grid", "value": "grid.20861.3d"}],
                "yop": rdate.split("-")[0],
                "data-type": "dataset",
                "dataset-dates": [{"type": "pub-date", "value": rdate}],
                "dataset-title": metadata["titles"][0]["title"],
                "performance": [],
                "grand-total-unique-investigations": 0,
                "grand-total-unique-requests": 0,
            }
            if not dataset.create(usage_collection, k, record_data):
                err = dataset.error_message()
                print(err)
                exit()
Example #4
def test_issue12(t, c_name):
    src = '''[
{"id": "1", "c1": 1, "c2": 2, "c3": 3 },
{"id": "2", "c1": 2, "c2": 2, "c3": 3 },
{"id": "3", "c1": 3, "c2": 3, "c3": 3 },
{"id": "4", "c1": 1, "c2": 1, "c3": 1 },
{"id": "5", "c1": 6, "c2": 6, "c3": 6 }
]'''
    #dataset.verbose_on() # DEBUG
    #dataset.use_strict_dotpath(True) # DEBUG
    if dataset.status(c_name) == False:
        if not dataset.init(c_name):
            err = dataset.error_message()
            t.error(f'failed to create {c_name}, {err}')
            return
    objects = json.loads(src)
    for obj in objects:
        key = obj['id']
        if dataset.has_key(c_name, key):
            dataset.update(c_name, key, obj)
        else:
            dataset.create(c_name, key, obj)
    f_names = dataset.frames(c_name)
    for f_name in f_names:
        ok = dataset.delete_frame(c_name, f_name)
        if ok == False:
            err = dataset.error_message()
            t.error(f'Failed to delete {f_name} from {c_name} -> "{err}"')
            return
        if dataset.has_frame(c_name, f_name) == True:
            t.error(
                f'Failed to delete frame {c_name} from {c_name}, frame still exists'
            )
            return
    f_name = 'issue12'
    dot_paths = [".c1", "c3"]
    labels = [".col1", ".col3"]
    keys = dataset.keys(c_name)
    if not dataset.frame_create(c_name, f_name, keys, dot_paths, labels):
        err = dataset.error_message()
        t.error(f'failed to create {f_name} from {c_name}, {err}')
    if not dataset.has_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected frame {f_name} to exist, {err}')
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) == 0:
        err = dataset.error_message()
        t.error(f'expected keys in {f_name}, got zero, {err}')
        return
    f_objects = dataset.frame_objects(c_name, f_name)
    if len(f_objects) == 0:
        err = dataset.error_message()
        t.error(f'expected objects in {f_name}, got zero, {err}')
        return
    if not dataset.delete_frame(c_name, f_name):
        err = dataset.error_message()
        t.error(f'expected to delete {f_name} in {c_name}, {err}')
Example #5
def test_check_repair(t, collection_name):
    t.print("Testing status on", collection_name)
    # Make sure we have a left over collection to check and repair
    if os.path.exists(collection_name) == True:
        shutil.rmtree(collection_name)
    if dataset.status(collection_name) == True:
        dataset.close(collection_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'init({collection_name}) failed, {err}')
        return
    if dataset.status(collection_name) == False:
        t.error(
            f"Failed, expected dataset.status() == True, got False for {collection_name}"
        )
        return

    if dataset.has_key(collection_name, 'one') == False:
        if dataset.create(collection_name, 'one', {"one": 1}) == False:
            err = dataset.error_message()
            t.error(
                f'create({collection_name}, "one", {{"one": 1}}) failed, {err}')
    t.print(f"Testing check on {collection_name}")
    # Check our collection
    if not (dataset.check(collection_name) == True):
        err = dataset.error_message()
        t.error(
            f"Failed, (before break) expected check True, got False for {collection_name} (err: {err})"
        )
        return

    # Break and recheck our collection
    print(f"Removing {collection_name}/collection.json to cause a fail")
    if os.path.exists(collection_name + "/collection.json"):
        os.remove(collection_name + "/collection.json")
    print(f"Testing check on (broken) {collection_name}")
    if not (dataset.check(collection_name) == False):
        err = dataset.error_message()
        t.error(
            f"Failed, (after break) expected check False got True for {collection_name} (err: {err})"
        )
    else:
        t.print(f"Should have seen error output for broken {collection_name}")

    # Repair our collection
    t.print("Testing repair on", collection_name)
    if dataset.repair(collection_name) == False:
        err = dataset.error_message()
        t.error("Failed, expected repair to return True, got", err)
    if os.path.exists(os.path.join(collection_name,
                                   "collection.json")) == False:
        t.error(
            f"Failed, expected recreated {collection_name}/collection.json")
Example #6
def test_frame(t, c_name):
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(err)
        return
    data = [{
        "id": "A",
        "one": "one",
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "one": "ONE",
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = ["._Key", ".one", ".two", ".three", ".four"]
    labels = ["_Key", "one", "two", "three", "four"]
    for row in data:
        key = row['id']
        keys.append(key)
        dataset.create(c_name, key, row)
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(err)
    if dataset.frame_reframe(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(err)
    l = dataset.frames(c_name)
    if len(l) != 1 or l[0] != 'f1':
        t.error(f"expected one frame name, f1, got {l}")
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
Example #7
def build_aggregate(collection):
    """Build a collection for usage by month.
    Always creates collection from scratch"""
    # Delete existing collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()

    # Find time periods
    start = datetime.fromisoformat("2017-01-01")
    today = datetime.today().date().isoformat()
    date_list = pd.date_range(start, today, freq="MS").strftime("%Y-%m").to_list()

    for month in date_list:
        if not dataset.create(collection, month, {"report-datasets": []}):
            err = dataset.error_message()
            print(err)
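As a follow-on sketch (not from the original source), one of the month buckets created above could later be read back and extended; the collection name and report entry here are illustrative, and the tuple-returning dataset.read() matches its use elsewhere on this page.

from py_dataset import dataset

collection = "usage_by_month.ds"  # hypothetical collection built by build_aggregate()
month = "2017-01"
usage, err = dataset.read(collection, month)
if err != "":
    print(err)
else:
    # Append an illustrative usage report to the month bucket and save it back
    usage["report-datasets"].append({"dataset-id": [{"type": "doi", "value": ""}]})
    if not dataset.update(collection, month, usage):
        print(dataset.error_message())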
Example #8
def get_history(collection, caltechdata_collection, caltechdata_keys):
    """Harvest the history of records from CaltechDATA."""

    keys_to_update = []
    if os.path.exists("historyupdate"):
        with open("historyupdate", "r") as infile:
            update = date.fromisoformat(infile.read())
    else:
        # Arbitrary old date - everything will be updated
        update = date(2011, 1, 1)
    for k in progressbar(caltechdata_keys, redirect_stdout=True):
        existing, err = dataset.read(caltechdata_collection, k)
        if err != "":
            print(f"Unexpected error on read: {err}")
        record_update = datetime.fromisoformat(existing["updated"]).date()
        if record_update > update:
            keys_to_update.append(k)

    if not os.path.isdir(collection):
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    base_url = "https://data.caltech.edu/records/"

    for k in progressbar(keys_to_update):
        url = base_url + str(k) + "/revisions"
        response = requests.get(url)
        revisions = response.json()
        for num, metadata in enumerate(revisions):
            key = f"{k}-{num}"
            if dataset.has_key(collection, key) == False:
                dataset.create(collection, key, metadata)

    # Save date in file
    today = date.today().isoformat()
    with open("historyupdate", "w") as outfile:
        outfile.write(today)
Example #9
def build_collection(collection):
    # We start from scratch with a new dataset collection
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    ok = dataset.init(collection)
    if ok == False:
        print("Dataset failed to init collection")
        exit()

    # Sync metadata from Google Sheet
    gsheet_id = '1er6yYk-7jcySyX7bqADIC_TrfSDTxwAwDh_hXSxSdoU'
    gsheet_name = 'data4tom'
    #The column for key values, starting at 1, in this case SRR
    id_col = 14
    #Range of cells to import. This is basically all, can modify to exclude
    #portions of gsheet
    cell_range = "A1:ZZ"

    err = dataset.import_gsheet(collection, gsheet_id, gsheet_name, id_col,
                                cell_range)
    if err != '':
        print(err)
        exit()
Example #10
parser.add_argument('data_collection', nargs=1, help=\
            'file name for the dataset collection with harvested data')
parser.add_argument('input_sheet', nargs=1, help=\
        'Input Google Sheet ID with author citations')
parser.add_argument('output_sheet', nargs=1, help='Output Google Sheet ID')
parser.add_argument('-limited', action='store_true', help=\
        'Save only the first three authors')
args = parser.parse_args()

name = args.data_collection[0]
sheet = args.input_sheet[0]
output_sheet = args.output_sheet[0]

import_coll = "imported.ds"
os.system("rm -rf imported.ds")
dataset.init(import_coll)

os.environ['GOOGLE_CLIENT_SECRET_JSON'] = "/etc/client_secret.json"
err = dataset.import_gsheet(import_coll, sheet, 'Sheet1', 1, 'A:CZ')
if err != '':
    print(err)

keys = dataset.keys(import_coll)

coauthors = []

count = 0
for key in progressbar(keys, redirect_stdout=True):
    record, err = dataset.read(name, key)
    if err != "":
        print(err)
Example #11
def test_sync_csv(t, c_name):
    # Setup test collection
    if os.path.exists(c_name):
        shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}) failed, {err}')
        return

    # Setup test CSV instance
    t_data = [{
        "_Key": "one",
        "value": 1
    }, {
        "_Key": "two",
        "value": 2
    }, {
        "_Key": "three",
        "value": 3
    }]
    csv_name = os.path.splitext(c_name)[0] + ".csv"
    if os.path.exists(csv_name):
        os.remove(csv_name)
    with open(csv_name, 'w') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=["_Key", "value"])
        csv_writer.writeheader()
        for obj in t_data:
            csv_writer.writerow(obj)

    # Import CSV into collection
    if dataset.import_csv(c_name, csv_name, True) == False:
        err = dataset.error_message()
        t.error(f'import_csv({c_name}, {csv_name}, True) failed, {err}')
        return
    for key in ["one", "two", "three"]:
        if dataset.has_key(c_name, key) == False:
            t.error(f"expected has_key({key}) == True, got False")
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key('five') == False, got True")
    if dataset.create(c_name, "five", {"value": 5}) == False:
        err = dataset.error_message()
        t.error(f'create({c_name}, "five", {{"value": 5}}) failed, {err}')
        return

    # Setup frame
    frame_name = 'test_sync'
    keys = dataset.keys(c_name)
    if dataset.frame_create(c_name, frame_name, keys, ["._Key", ".value"],
                            ["_Key", "value"]) == False:
        err = dataset.error_message()
        t.error(f'frame_create({c_name}, {frame_name}, ...) failed, {err}')
        return

    #NOTE: Tests for sync_send_csv and sync_receive_csv
    if dataset.sync_send_csv(c_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'sync_send_csv({c_name}, {frame_name}, {csv_name}) failed, {err}')
        return
    with open(csv_name) as fp:
        src = fp.read()
        if 'five' not in src:
            t.error(f"expected 'five' in src, got {src}")

    # Now remove "five" from collection
    if dataset.delete(c_name, "five") == False:
        err = dataset.error_message()
        t.error(f'delete({c_name}, "five") failed, {err}')
        return
    if dataset.has_key(c_name, "five") == True:
        t.error(f"expected has_key(five) == False, got True")
        return
    if dataset.sync_recieve_csv(c_name, frame_name, csv_name, False) == False:
        err = dataset.error_message()
        t.error(
            f'sync_receive_csv({c_name}, {frame_name}, {csv_name}) failed, {err}'
        )
        return
    if dataset.has_key(c_name, "five") == False:
        t.error(f"expected has_key(five) == True, got False")
        return
Example #12
                        action="store_true",
                        help="Get resolver links from DataCite")

    args = parser.parse_args()

    # S3 Setup
    session = boto3.Session(profile_name="resolver")
    current_region = session.region_name
    bucket = "resolver.library.caltech.edu"
    s3 = session.resource("s3")

    collection = "link_history.ds"
    if os.path.isdir(collection) == False:
        make_s3_record(s3, bucket, "index.html",
                       "https://libguides.caltech.edu/CODA")
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    # Get the links that already exist
    links = dataset.keys(collection)
    if args.update:
        # Everything will get updated
        links = []

    # Get DOI links
    if args.dois:
        client_ids = [
            "tind.caltech",
            "caltech.library",
            "caltech.ipacdoi",
Example #13
        cmd.append("-password")
        cmd.append(eprint_password)
    cmd.append("-export")
    cmd.append("all")
    p = run(cmd)
    exit_code = p.returncode
    if exit_code != 0:
        print(f"ERROR: {' '.join(cmd)}, exit code {exit_code}")
        sys.exit(1)


c_name = "campuspubs.ds"

ok = dataset.status(c_name)
if ok == False:
    err = dataset.init(c_name)
    if err != "":
        print(f"{c_name}, {err}")

harvest = False

if harvest == True:
    username = os.environ["USER"]
    password = os.environ["PW"]
    returnc = ep_full(
        c_name, "https://caltechcampuspubs.library.caltech.edu/", username, password
    )
    print(returnc)

keys = dataset.keys(c_name)
for key in keys:
Example #14
        cmd.append("-password")
        cmd.append(eprint_password)
    cmd.append("-export")
    cmd.append("all")
    p = run(cmd)
    exit_code = p.returncode
    if exit_code != 0:
        print(f"ERROR: {' '.join(cmd)}, exit code {exit_code}")
        sys.exit(1)


c_name = "oh.ds"

ok = dataset.status(c_name)
if ok == False:
    err = dataset.init(c_name, layout="pairtree")
    if err != "":
        print(f"{c_name}, {err}")

harvest = False

if harvest == True:
    username = os.environ["USER"]
    password = os.environ["PW"]
    returnc = ep_full(
        c_name, "http://oralhistories.library.caltech.edu/", username, password
    )
    print(returnc)

keys = dataset.keys(c_name)
for key in keys:
Example #15
def test_issue43(t, collection_name, csv_name):
    if os.path.exists(collection_name):
        shutil.rmtree(collection_name)
    if os.path.exists(csv_name):
        os.remove(csv_name)
    if dataset.init(collection_name) == False:
        err = dataset.error_message()
        t.error(f'Failed, need a {collection_name} to run test, {err}')
        return
    table = {
        "r1": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        },
        "r2": {
            "c1": "one",
            "c3": "three",
            "c4": "four"
        },
        "r3": {
            "c1": "one",
            "c2": "two",
            "c4": "four"
        },
        "r4": {
            "c1": "one",
            "c2": "two",
            "c3": "three"
        },
        "r5": {
            "c1": "one",
            "c2": "two",
            "c3": "three",
            "c4": "four"
        }
    }
    for key in table:
        row = table[key]
        if dataset.create(collection_name, key, row) == False:
            err = dataset.error_message()
            t.error(f"Can't add test row {key} to {collection_name}, {err}")
            return

    dataset.use_strict_dotpath(False)
    # Setup frame
    frame_name = 'f1'
    keys = dataset.keys(collection_name)
    if dataset.frame_create(collection_name, frame_name, keys,
                            ["._Key", ".c1", ".c2", ".c3", ".c4"],
                            ["_Key", "c1", "c2", "c3", "c4"]) == False:
        err = dataset.error_message()
        t.error(err)
        return
    if dataset.export_csv(collection_name, frame_name, csv_name) == False:
        err = dataset.error_message()
        t.error(
            f'export_csv({collection_name}, {frame_name}, {csv_name}) should have emitted warnings, not error, {err}'
        )
        return
    with open(csv_name, mode='r', encoding='utf-8') as f:
        rows = f.read()

    for row in rows.split('\n'):
        if len(row) > 0:
            cells = row.split(',')
            if len(cells) < 5:
                t.error(f'row error {csv_name} for {cells}')
Example #16
def test_frame_objects(t, c_name):
    if dataset.status(c_name) == True:
        dataset.close(c_name)
        if os.path.exists(c_name):
            shutil.rmtree(c_name)
    if dataset.init(c_name) == False:
        err = dataset.error_message()
        t.error(f'init({c_name}), {err}')
        return
    data = [{
        "id": "A",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }, {
            "nameIdentifier": "H-XXXX-XXXX",
            "nameIdentifierScheme": "ResearcherID",
            "schemeURI": "http://www.researcherid.com/rid/"
        }],
        "two": 22,
        "three": 3.0,
        "four": ["one", "two", "three"]
    }, {
        "id": "B",
        "two": 2000,
        "three": 3000.1
    }, {
        "id": "C"
    }, {
        "id": "D",
        "nameIdentifiers": [{
            "nameIdentifier": "0000-000X-XXXX-XXXX",
            "nameIdentifierScheme": "ORCID",
            "schemeURI": "http://orcid.org/"
        }],
        "two": 20,
        "three": 334.1,
        "four": []
    }]
    keys = []
    dot_paths = [
        "._Key", ".nameIdentifiers", ".nameIdentifiers[:].nameIdentifier",
        ".two", ".three", ".four"
    ]
    labels = [
        "id", "nameIdentifiers", "nameIdentifier", "two", "three", "four"
    ]
    for row in data:
        key = row['id']
        keys.append(key)
        err = dataset.create(c_name, key, row)
    f_name = 'f1'
    if dataset.frame_create(c_name, f_name, keys, dot_paths, labels) == False:
        err = dataset.error_message()
        t.error(
            f'frame_create({c_name}, {f_name}, {keys}, {dot_paths}, {labels}), {err}'
        )
        return
    f_keys = dataset.frame_keys(c_name, f_name)
    if len(f_keys) != len(keys):
        t.error(f'expected {len(keys)}, got {len(f_keys)}')
    if dataset.frame_refresh(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'frame_refresh({c_name}, {f_name}), {err}')
    l = dataset.frames(c_name)
    if len(l) != 1 or l[0] != 'f1':
        t.error(f"expected one frame name, f1, got {l}")
    object_result = dataset.frame_objects(c_name, f_name)
    if len(object_result) != 4:
        t.error(
            f'Did not get correct number of objects back, expected 4 got {len(object_result)}, {object_result}'
        )
    count_nameId = 0
    count_nameIdObj = 0
    for obj in object_result:
        if 'id' not in obj:
            t.error('Did not get id in object')
        if 'nameIdentifiers' in obj:
            count_nameId += 1
            for idv in obj['nameIdentifiers']:
                if 'nameIdentifier' not in idv:
                    t.error('Missing part of object')
        if 'nameIdentifier' in obj:
            count_nameIdObj += 1
            if "0000-000X-XXXX-XXXX" not in obj['nameIdentifier']:
                t.error('Missing object in complex dot path')
    if count_nameId != 2:
        t.error(
            f"Incorrect number of nameIdentifiers elements, expected 2, got {count_nameId}"
        )
    if count_nameIdObj != 2:
        t.error(
            f"Incorrect number of nameIdentifier elements, expected 2, got {count_nameIdObj}"
        )
    if dataset.delete_frame(c_name, f_name) == False:
        err = dataset.error_message()
        t.error(f'delete_frame({c_name}, {f_name}), {err}')
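A short consumption sketch (not part of the original tests): before delete_frame runs in the example above, the frame's objects could be scanned for ORCID identifiers; it reuses the same c_name and f_name, and the field names follow the data defined in this example.

orcids = []
for obj in dataset.frame_objects(c_name, f_name):
    # Each object keeps the full nameIdentifiers list per the ".nameIdentifiers" dot path
    for idv in obj.get("nameIdentifiers", []):
        if idv.get("nameIdentifierScheme") == "ORCID":
            orcids.append(idv.get("nameIdentifier"))
print(orcids)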
Example #17
def get_crossref_refs(prefix, done=False, new=True):
    # New=True will download everything from scratch and delete any existing records

    collection = "crossref_refs.ds"

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    base_url = (
        "https://api.eventdata.crossref.org/v1/events?mailto=[email protected]&source=crossref&obj-id.prefix="
        + prefix)

    collected = dataset.has_key(collection, "captured")

    cursor = ""
    count = 0
    while cursor != None:
        if collected == True:
            date, err = dataset.read(collection, "captured")
            if err != "":
                print("error on read: " + err)
            date = date["captured"]
            print(date)
            url = base_url + "&from-collected-date=" + date
        else:
            url = base_url
        if cursor != "":
            url = url + "&cursor=" + cursor
        print(url)
        r = requests.get(url)
        records = r.json()
        if records["status"] == "failed":
            print(records)
            break
        for rec in records["message"]["events"]:
            # Save results in dataset
            print(count, rec["id"])
            count = count + 1  # Just for prettiness
            if not dataset.create(collection, rec["id"], rec):
                err = dataset.error_message()
                print("Error in saving record: " + err)

        if cursor == records["message"]["next-cursor"]:
            # Catches bug where we get the same cursor back at end of results
            break
        if records["message"]["total-results"] > count:
            cursor = records["message"]["next-cursor"]
        else:
            cursor = None

    if collected == True:
        date, err = dataset.read(collection, "captured")
        if err != "":
            print("Error in reading date: " + err)
        date = date["captured"]

        # Check Deleted
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/deleted?mailto=[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Delete results in dataset
                print("Deleted: ", rec["id"])
                if not dataset.delete(collection, rec["id"]):
                    err = dataset.error_message()
                    print(f"Unexpected error on read: {err}")
            cursor = records["message"]["next-cursor"]

        # Check Edited
        cursor = ""
        while cursor != None:
            del_url = "https://api.eventdata.crossref.org/v1/events/edited?mailto=[email protected]&source=crossref"
            full = del_url + "&from-collected-date=" + date + "&cursor=" + cursor
            r = requests.get(full)
            records = r.json()
            for rec in records["message"]["events"]:
                # Update results in dataset
                print("Update: ", rec["id"])
                if not dataset.update(collection, rec["id"], rec):
                    err = dataset.error_message()
                    print(f"Unexpected error on write: {err}")
            cursor = records["message"]["next-cursor"]

    if done:
        date = datetime.date.today().isoformat()
        record = {"captured": date}
        if dataset.has_key(collection, "captured"):
            if not dataset.update(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on update: {err}")
        else:
            if not dataset.create(collection, "captured", record):
                err = dataset.error_message()
                print(f"Unexpected error on create: {err}")
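The create-or-update bookmark pattern at the end of this example recurs in several of the harvesters on this page; here is a small helper sketch (not in the original code) that captures it, using only the boolean-returning calls shown above.

from py_dataset import dataset

def save_bookmark(collection, key, record):
    # Create the record if the key is new, otherwise update it in place
    if dataset.has_key(collection, key):
        ok = dataset.update(collection, key, record)
    else:
        ok = dataset.create(collection, key, record)
    if not ok:
        print(f"Unexpected error: {dataset.error_message()}")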
Example #18
import os
import sys
import json
from py_dataset import dataset
from py_sitetools import mkpage, frontmatter, version_no, Logger

log = Logger(os.getpid())

# Minimal configuration
docs_dir = "docs"
site_dir = "htdocs"
c_name = "boutique.ds"
index_tmpl = "templates/index.tmpl"

# Create our boutique.ds if required
if os.path.exists("boutique.ds") == False:
    dataset.init("boutique.ds")

# crawl docs_dir and ingest files into data collection.
for path, folders, files in os.walk(docs_dir):
    #log.print(f"Processing {path}")
    for filename in files:
        if filename.endswith(".md"):
            f_name = os.path.join(path, filename)
            log.print(f"Ingesting {f_name}")
            metadata = frontmatter(f_name)
            with open(f_name) as f:
                src = f.read()
            if "id" in metadata:
                key = str(metadata["id"])
                if dataset.has_key(c_name, key):
                    err = dataset.update(c_name, key, {
Example #19
import urllib

#Get access token from WOS set as environment variable with source token.bash
token = os.environ['WOSTOK']

headers = {'X-ApiKey': token, 'Content-type': 'application/json'}

#Get input
name = input("Enter a WOS author search term (e.g. Mooley K):")
caltech = input("Restrict to Caltech-affiliated papers? Y or N:")
sheet = input("Enter the google sheet ID:")

#Set up collection
collection = name.split()[0] + '.ds'
subprocess.run(['rm', '-rf', collection])
dataset.init(collection)
base_url = 'https://api.clarivate.com/api/wos/?databaseId=WOK'
url = base_url + '&count=100&firstRecord=1'

if caltech == 'Y':
    query = 'AU=(' + name + ') AND OG=(California Institute of Technology)'
else:
    query = 'AU=(' + name + ')'
query = urllib.parse.quote_plus(query)
url = url + '&usrQuery=' + query

print(url)
response = requests.get(url, headers=headers)
response = response.json()
record_count = response['QueryResult']['RecordsFound']
print(record_count, " Records from WOS")
Example #20
def get_wos_refs(new=True):
    # New=True will download everything from scratch and delete any existing records

    collection = "all_wos.ds"

    if new == True:
        if os.path.exists(collection) == True:
            shutil.rmtree(collection)

    if os.path.isdir(collection) == False:
        ok = dataset.init(collection)
        if ok == False:
            print("Dataset failed to init collection")
            exit()

    # Get access token from WOS set as environment variable with source token.bash
    token = os.environ["WOSTOK"]

    headers = {"X-ApiKey": token, "Content-type": "application/json"}

    # Run query to get scope of records

    base_url = "https://api.clarivate.com/api/wos/?databaseId=WOK"

    collected = dataset.has_key(collection, "captured")

    if collected == True:
        date = dataset.read(collection, "captured")
        date = date[0]["captured"]
        date = datetime.fromisoformat(date)
        current = datetime.today()
        diff = current - date
        base_url = base_url + "&loadTimeSpan=" + str(diff.days) + "D"

    date = datetime.today().isoformat()
    record = {"captured": date}
    if dataset.has_key(collection, "captured"):
        err = dataset.update(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on update: {err}")
    else:
        err = dataset.create(collection, "captured", record)
        if err != "":
            print(f"Unexpected error on create: {err}")

    query = "OG=(California Institute of Technology)"
    query = urllib.parse.quote_plus(query)
    url = base_url + "&usrQuery=" + query + "&count=100&firstRecord=1"

    response = requests.get(url, headers=headers)
    response = response.json()
    record_count = response["QueryResult"]["RecordsFound"]
    print(record_count, " Records from WOS")
    query_id = response["QueryResult"]["QueryID"]
    try:
        records = response["Data"]["Records"]["records"]["REC"]
    except:
        print(response)
    write_records(records, collection)
    # We have saved the first 100 records
    record_start = 101
    record_count = record_count - 100

    query_url = "https://api.clarivate.com/api/wos/query/"

    while record_count > 0:
        print(record_count)
        print(len(records), "records")
        if record_count > 100:
            url = (
                query_url
                + str(query_id)
                + "?count=100&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            try:
                records = response["Records"]["records"]["REC"]
            except:
                print(response)
            write_records(records, collection)
            record_start = record_start + 100
            record_count = record_count - 100
        else:
            url = (
                query_url
                + str(query_id)
                + "?count="
                + str(record_count)
                + "&firstRecord="
                + str(record_start)
            )
            response = requests.get(url, headers=headers)
            response = response.json()
            records = response["Records"]["records"]["REC"]
            write_records(records, collection)
            record_count = 0

    print("Downloaded all records ")
Example #21
            for f in files:
                if f != None:
                    os.remove(f)

            ### Need to handle old files


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="caltechdata_backup queries the caltechDATA (Invenio 3) API\
    returns data and adds to dataset structure on disk")

    collection = "caltechdata.ds"
    if os.path.isdir(collection) == False:
        err = dataset.init(collection)
        if err != "":
            print(f"Failed on create {err}")
            exit()

    args = parser.parse_args()

    api_url = "https://data.caltech.edu/api/records/"

    # Get the existing records
    current = dataset.keys(collection)
    req = requests.get(api_url)
    data = req.json()

    temp = 'temp'
    if os.path.isdir(temp) == False:
Example #22
def get_cd_github(new=True):

    collection = "github_records.ds"

    if new == True:
        os.system("rm -rf " + collection)

    if os.path.isdir(collection) == False:
        if not dataset.init(collection):
            print("Dataset failed to init collection")
            exit()

    url = "https://data.caltech.edu/api/records"

    response = requests.get(url + "/?size=1000&q=subjects:GitHub")
    hits = response.json()

    for h in hits["hits"]["hits"]:
        rid = str(h["id"])
        record = h["metadata"]

        result = dataset.has_key(collection, rid)

        if result == False:

            dataset.create(collection, rid, record)

            print("Downloading files for ", rid)

            codemeta = False

            for erecord in record["electronic_location_and_access"]:
                f = download_file(erecord, rid)

                # We're just looking for the zip file
                if f.split(".")[-1] == "zip":
                    zip_files = subprocess.check_output(
                        ["unzip", "-l", f.rstrip()],
                        universal_newlines=True).splitlines()
                    i = 4  # Ignore header
                    line = zip_files[i]
                    while line[0] != "-":
                        split = line.split("/")
                        fname = split[1]
                        if fname == "codemeta.json":
                            sp = line.split("   ")[-1]
                            os.system("unzip -j " + f.rstrip() + " " + sp +
                                      " -d .")
                            codemeta = True
                        i = i + 1
                        line = zip_files[i]
                        # Will only identify codemeta files in root of repo

                # Trash downloaded files - extracted codemeta.json not impacted
                print("Trash " + f)
                os.system("rm " + f)

            if codemeta == True:
                print(collection, rid)
                response = dataset.attach(collection, rid, ["codemeta.json"])
                print("Attachment ", response)
                os.system("rm codemeta.json")
                print("Trash codemeta.json")