Example #1
def resolver_links(source, keys, outfile=None):
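    """Switch official_url values from http:// to https://.

    When ``source`` is a dataset collection (.ds) this only writes a report to
    ``outfile``; otherwise it pushes the corrected URL to the EPrints REST API.
    """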
    if source.split(".")[-1] == "ds":
        # This generates a report
        dot_paths = [".eprint_id", ".official_url"]
        labels = ["eprint_id", "official_url"]
        all_metadata = get_records(dot_paths, "official", source, keys, labels)
        for meta in all_metadata:
            new = replace_string(meta, "official_url", "http://", "https://")
            if new:
                outfile.writerow([meta["eprint_id"], meta["official_url"], new])
    else:
        # This applies the changes via the EPrints REST API
        for eprint_id in progressbar(keys, redirect_stdout=True):
            meta = get_eprint(source, eprint_id)
            # Ignore errors where the record doesn't exist
            if meta is not None:
                if meta["eprint_status"] not in ["deletion", "inbox"]:
                    new = replace_string(meta, "official_url", "http://", "https://")
                    if new:
                        url = (
                            source
                            + "/rest/eprint/"
                            + str(eprint_id)
                            + "/official_url.txt"
                        )
                        headers = {"content-type": "text/plain"}
                        print(eprint_id)
                        response = requests.put(url, data=new, headers=headers)
                        print(response)
Example #2
def file_mapping(source_collection):
    """Return a dictionary that maps /tindfiles/serve urls to records."""

    mapping = {}

    dot_paths = [".electronic_location_and_access", "._Key"]
    keys = dataset.keys(source_collection)
    metadata = get_records(dot_paths, "files", source_collection, keys)

    for record in metadata:
        # Handle history records where the key is the item and revision
        k = record["_Key"]
        if "-" in k:
            rec_id = k.split("-")[0]
        else:
            rec_id = k

        # Ignore embargoed records
        if "electronic_location_and_access" in record:
            for filev in record["electronic_location_and_access"]:
                url = filev["uniform_resource_identifier"]
                # name = filev['electronic_name'][0]
                if url not in mapping:
                    mapping[url] = rec_id

    return mapping
Example #3
def update_doi(source, keys, outfile=None):
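    """Update DOIs flagged by decide_doi_update().

    When ``source`` is a dataset collection (.ds) this writes a report row for
    each proposed change to ``outfile``; otherwise it PUTs the new DOI to the
    EPrints REST API.
    """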
    if source.split(".")[-1] == "ds":
        # This generates a report
        dot_paths = [".eprint_id", ".doi", ".related_url"]
        labels = ["eprint_id", "doi", "related_url"]
        all_metadata = get_records(dot_paths, "doi", source, keys, labels)
        for metadata in all_metadata:
            update = decide_doi_update(metadata)
            if update:
                outfile.writerow(update)
    else:
        for eprint_id in progressbar(keys, redirect_stdout=True):
            print(eprint_id)
            meta = get_eprint(source, eprint_id)
            # Ignore errors where the record doesn't exist
            if meta is not None:
                update = decide_doi_update(meta)
                if update:
                    url = source + "/rest/eprint/" + str(eprint_id) + "/doi.txt"
                    headers = {"content-type": "text/plain"}
                    # Strip invisible characters (zero-width spaces) from ASM DOIs
                    doi = update[1].replace("\u200b", "")
                    response = requests.put(url, data=doi, headers=headers)
                    print(response)
Example #4
def special_characters(source, keys, outfile=None):
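    """Report updated titles and abstracts after mapping TeX-style markers
    (for example _2 or ^3) to Unicode sub/superscripts and curly quotes to
    straight ones."""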
    replacements = {
        "_0": "₀",
        "_1": "₁",
        "_2": "₂",
        "_3": "₃",
        "_4": "₄",
        "_5": "₅",
        "_6": "₆",
        "_7": "₇",
        "_8": "₈",
        "_9": "₉",
        "_+": "₊",
        "_-": "₋",
        "_a": "ₐ",
        "_e": "ₑ",
        "_o": "ₒ",
        "_x": "ₓ",
        "^0": "⁰",
        "^1": "¹",
        "^2": "²",
        "^3": "³",
        "^4": "⁴",
        "^5": "⁵",
        "^6": "⁶",
        "^7": "⁷",
        "^8": "⁸",
        "^9": "⁹",
        "^+": "⁺",
        "^-": "⁻",
        "^n": "ⁿ",
        "^i": "ⁱ",
        "’": "'",
        "“": '"',
        "”": '"',
    }
    if source.split(".")[-1] == "ds":
        dot_paths = [".eprint_id", ".title", ".abstract"]
        labels = ["eprint_id", "title", "abstract"]
        all_metadata = get_records(dot_paths, "official", source, keys, labels)
        outfile.writerow(
            [
                "eprints_id",
                # "Current Title",
                "Updated Title",
                # "Current Abstract",
                "Updated Abstract",
            ]
        )
        for meta in all_metadata:
            eprint_id = meta["eprint_id"]
            newtitle = replace_character(meta, "title", replacements)
            if "abstract" in meta:
                newabstract = replace_character(meta, "abstract", replacements)
            else:
                newabstract = None
            if outfile:
                if newtitle or newabstract:
                    row = [eprint_id]
                    if newtitle:
                        row += [newtitle]  # [meta["title"], newtitle]
                    else:
                        # Placeholder keeps the row aligned with the header columns
                        row += [""]
                    if newabstract:
                        row += [newabstract]  # [meta["abstract"], newabstract]
                    outfile.writerow(row)
Example #5
def release_files(source, base_url, outfile=None):
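    """Release validuser-restricted files on 2004-2005 thesis records.

    When ``source`` is a dataset collection (.ds), writes a report row for each
    affected record to ``outfile`` and flips full_text_status and per-document
    security to public through the EPrints REST API at ``base_url``.
    """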
    if source.split(".")[-1] == "ds":
        # This generates a report
        dot_paths = [
            ".eprint_id",
            ".documents",
            ".date",
            ".eprint_status",
            ".creators.items[0].name.family",
            ".thesis_type",
            ".full_text_status",
        ]
        labels = [
            "eprint_id",
            "documents",
            "date",
            "status",
            "family",
            "type",
            "full_text",
        ]
        keys = dataset.keys(source)
        all_metadata = get_records(dot_paths, "official", source, keys, labels)
        all_metadata.sort(key=lambda meta: meta["family"])
        all_metadata.sort(key=lambda meta: meta["date"])
        for meta in all_metadata:
            year = meta["date"].split("-")[0]
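            # Only 2004-2005 thesis records are candidates for release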
            if is_in_range("2004-2005", year):
                if thesis_match(meta):
                    files = []
                    fnames = []
                    count = 0
                    for document in meta["documents"]:
                        count = count + 1
                        if document["security"] == "validuser":
                            files.append(count)
                            fnames.append(document["main"])
                    if len(files) > 0:
                        eprint_id = meta["eprint_id"]
                        print(eprint_id)
                        outfile.writerow(
                            [
                                year,
                                meta["family"],
                                eprint_id,
                                meta["status"],
                                meta["full_text"],
                                files,
                                fnames,
                            ]
                        )
                        mixed = False
                        # The same plain-text headers are used by both PUT requests below
                        headers = {"content-type": "text/plain"}
                        for filen in files:
                            new = "public"
                            # Check the eprint-level full_text_status first
                            url = (
                                base_url
                                + "/rest/eprint/"
                                + str(eprint_id)
                                + "/full_text_status.txt"
                            )
                            response = requests.get(url)
                            eprint_status = response.text
                            if eprint_status == "restricted":
                                response = requests.put(url, data=new, headers=headers)
                                print(response)
                            elif eprint_status == "mixed":
                                print("mixed, skipping")
                                mixed = True
                            elif eprint_status != "public":
                                print(eprint_status)
                                print(url)
                                exit()
                            url = (
                                base_url
                                + "/rest/eprint/"
                                + str(eprint_id)
                                + "/documents/"
                                + str(filen)
                                + "/security.txt"
                            )
                            response = requests.get(url)
                            live_status = response.text
                            if not mixed:
                                if live_status == "validuser":
                                    response = requests.put(
                                        url, data=new, headers=headers
                                    )
                                    print(response)
                                elif live_status != "public":
                                    print(live_status)
                                    print(url)
                                    exit()
Example #6
def add_thesis_doi(data_collection, thesis_collection, token, production=True):
    """Add in theis DOI to CaltechDATA records"""

    # Search across CaltechTHESIS DOIs
    dot_paths = ["._Key", ".doi", ".official_url", ".related_url"]
    labels = ["eprint_id", "doi", "official_url", "related_url"]
    keys = dataset.keys(thesis_collection)
    all_metadata = get_records(dot_paths, "dois", thesis_collection, keys, labels)
    dois = []
    for metadata in progressbar(all_metadata, redirect_stdout=True):
        if "doi" in metadata:
            record_doi = metadata["doi"].strip()
            if "related_url" in metadata and "items" in metadata["related_url"]:
                items = metadata["related_url"]["items"]
                for item in items:
                    # Skip items missing a url or type so values from a
                    # previous item are never reused
                    if "url" not in item or "type" not in item:
                        continue
                    url = item["url"].strip()
                    itype = item["type"].strip().lower()
                    if itype == "doi":
                        if idutils.is_doi(url):
                            doi = "10." + url.split("10.")[1]
                            prefix = doi.split("/")[0]
                            if prefix == "10.22002":
                                dois.append([doi, record_doi])
                        else:
                            print("Ignoring non-DOI")
                            print(metadata["eprint_id"])
                            print(url.split("10."))
    for doi_link in dois:
        cd_doi = doi_link[0]
        thesis_doi = doi_link[1]
        print("Checking " + cd_doi)
        if "D1" in cd_doi:
            record_number = cd_doi.split("D1.")[1]
        if "d1" in cd_doi:
            record_number = cd_doi.split("d1.")[1]
        record, err = dataset.read(data_collection, record_number)
        if err != "":
            print(err)
            exit()

        done = False
        if "relatedIdentifiers" in record:
            for idv in record["relatedIdentifiers"]:
                identifier = idv["relatedIdentifier"]
                if identifier == thesis_doi:
                    done = True
            if not done:
                identifiers = record["relatedIdentifiers"]
                identifiers.append(
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                )
                new_metadata = {"relatedIdentifiers": identifiers}
        else:
            new_metadata = {
                "relatedIdentifiers": [
                    {
                        "relatedIdentifier": thesis_doi,
                        "relatedIdentifierType": "DOI",
                        "relationType": "IsSupplementTo",
                    }
                ]
            }
        if not done:
            print("Adding " + thesis_doi + " to " + cd_doi)
            response = caltechdata_edit(
                token, record_number, new_metadata, {}, {}, True
            )
            print(response)
Example #7
    if err != "":
        print(err)

    people_list = dataset.keys(import_coll)
    people = []
    for p in people_list:
        record, err = dataset.read(import_coll, p)
        people.append(record)

    # Profiles collection from feeds
    profile_ds = "profiles.ds"
    keys = dataset.keys(profile_ds)
    labels = ["orcid", "creator_id"]
    dot_paths = [".orcid", ".creator_id"]

    all_metadata = get_records(dot_paths, "profile", profile_ds, keys, labels)
    for profile in all_metadata:
        if "creator_id" in profile:
            idv = profile["creator_id"]
        else:
            print("ERROR", profile)
            continue
        for person in people:
            if person["Authors_ID"] != "":
                if person["Authors_ID"] == idv:
                    if person["ORCID"] == "":
                        person["ORCID"] = profile["orcid"]
                        dataset.update(import_coll, person["CL_PEOPLE_ID"], person)
                        print("Updated ", person["CL_PEOPLE_ID"])
                    elif person["ORCID"] != profile["orcid"]:
                        print(
                            "Inconsistent ORCIDS for ",
Example #8
def agent_report(file_name, repo, aspace):
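    """Report matches between CaltechPEOPLE and ArchivesSpace person agents.

    Writes four CSVs named after ``file_name``: people already linked to an
    agent, names newly matched by sort name, agents with no CaltechPEOPLE
    match, and people with no ArchivesSpace match.
    """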
    dot_paths = [
        "._Key",
        ".directory_info",
        ".ORCID",
        ".sort_name",
        ".ArchivesSpace_ID",
        ".family",
        ".given",
    ]
    labels = ["id", "directory_info", "orcid", "name", "as", "family", "given"]
    source = get_caltechfeed("people")
    keys = dataset.keys(source)
    keys.remove("captured")

    all_metadata = get_records(dot_paths, "p_list", source, keys, labels)

    all_metadata.sort(key=lambda metadata: metadata["id"])

    fname = file_name.split(".")[0]
    fcaltechpeople = fname + "_caltechpeople.csv"
    fmatched = fname + "_matched.csv"
    fnew_caltechpeople = fname + "_newcaltechpeople.csv"
    fnew_aspace = fname + "_newaspace.csv"

    caltechpeople = csv.writer(open(fcaltechpeople, "w"))
    matched = csv.writer(open(fmatched, "w"))
    new_caltechpeople = csv.writer(open(fnew_caltechpeople, "w"))
    new_aspace = csv.writer(open(fnew_aspace, "w"))

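    # to_match / gen_match hold feeds people with no ArchivesSpace ID, keyed by
    # sort name and family name; already_matched is keyed by ArchivesSpace ID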
    to_match = {}
    gen_match = {}
    already_matched = {}

    aspace_url = "https://collections.archives.caltech.edu/agents/people/"
    feeds_url = "https://feeds.library.caltech.edu/people/"

    for metadata in all_metadata:
        if "as" in metadata:
            if metadata["as"] != "":
                already_matched[metadata["as"]] = metadata
            else:
                to_match[metadata["name"]] = metadata
                gen_match[metadata["family"]] = metadata
    print(f"{len(already_matched)} agents already in CaltechPEOPLE")

    print(f"Requesting agents")
    for agent in progressbar(aspace.agents):
        if agent.agent_type == "agent_person":
            primary_name = agent.display_name.primary_name
            name = agent.display_name.sort_name
            published = agent.publish
            uid = int(agent.uri.split("/")[-1])
            if uid not in already_matched:
                if name in to_match:
                    person = to_match[name]
                    matched.writerow([
                        person["name"],
                        uid,
                        aspace_url + str(uid),
                        person["id"],
                        feeds_url + person["id"],
                        published,
                    ])
                    to_match.pop(name)
                else:
                    new_caltechpeople.writerow(
                        [name, uid, aspace_url + str(uid), published])
            else:
                metadata = already_matched[uid]
                caltechpeople.writerow([
                    metadata["name"],
                    metadata["as"],
                    aspace_url + str(metadata["as"]),
                    metadata["id"],
                    feeds_url + metadata["id"],
                    published,
                ])

    for name in to_match:
        new_aspace.writerow(
            [name, to_match[name]["id"], feeds_url + to_match[name]["id"]])