# Example #1
# 0
def load_geonames(filename: str, key: str = "danielnoord") -> None:
    """Load geonames data for keys in json file.

    Looks up every placename (by its ``en_GB`` label) on the Geonames API,
    stores the resulting ``geonames_id`` and a Wikipedia link (when Geonames
    provides one), then writes the updated file.

    Args:
        filename: Filename to load
        key: Geonames API username to use for the lookups
    """

    # Load file; the "$schema" entry is metadata, not a placename, so it is
    # removed before iteration and re-added before writing.
    with open(
            filename,
            encoding="utf-8",
    ) as file:
        placenames = json.load(file)
    del placenames["$schema"]
    placenames = cast(TranslationDictCleanedPlacenames, placenames)

    # Load data: one search call to resolve the id, one "details" call for
    # the Wikipedia link.
    for value in placenames.values():
        print(f"Looking for data for {value['en_GB']}")
        geoname = geocoder.geonames(value["en_GB"], key=key)
        geoname_details = geocoder.geonames(geoname.geonames_id,
                                            key=key,
                                            method="details")
        value["geonames_id"] = geoname.geonames_id
        # Geonames returns wikipedia links without a scheme; prepend one.
        if geoname_details.wikipedia:
            value[
                "geonames_wikipedia"] = f"https://{geoname_details.wikipedia}"
        else:
            value["geonames_wikipedia"] = None

    # Re-add schema
    placenames["$schema"] = "../static/JSON/Placenames.json"

    write_single_json_file(placenames, "outputs/Translations",
                           "Placenames.json")
def search_isni_api(database: IndividualsDictCleaned) -> None:
    """Checks name and surname pairs and sees if they match with ISNI identifiers."""
    record_tag = "{http://www.loc.gov/zing/srw/}record"

    for data in database.values():
        # Only query for individuals that do not have an ISNI id yet.
        if data.get("ISNI:id", None):
            continue

        # Spaces become '+' so the full name fits in the query URL.
        name = f"{data['name']} {data['surname']}".replace(" ", "+")
        response = requests.get(
            f"http://isni.oclc.org/sru/?query=pica.nw+%3D+%22{name}%22&operation=searchRetrieve&recordSchema=isni-b&maximumRecords=10"  # pylint: disable=line-too-long
        )

        records = list(
            ElementTree.fromstring(response.content).iter(record_tag))

        if records:
            print("\n", data["name"], data["surname"])

        # Print every candidate match so a human can pick the right one.
        for record in records:
            uri = list(record.iter("isniURI"))[0].text
            # forename/surname sub-elements are optional in the response.
            forename = next(
                (element.text for element in record.iter("forename")), None)
            surname = next(
                (element.text for element in record.iter("surname")), None)
            print(forename, surname)
            print(uri)

    database[
        "$schema"] = "../static/JSON/Individuals.json"  # type: ignore[assignment]
    write_single_json_file(database, "outputs", "Individuals.json")
def sort_database(filename: str) -> None:
    """Sorts the entries in a database.

    Sorts each person's ``sources`` and ``sources_other`` lists
    alphabetically, and their ``titles`` and ``functions`` pair-lists by the
    second element (entries whose second element is ``None`` come first).

    Args:
        filename: File name of initial database
    """
    with open(filename, encoding="utf-8") as file:
        persons = json.load(file)
    # "$schema" is metadata, not a person entry; re-added before writing.
    del persons["$schema"]

    for identifier, data in persons.items():
        _sort_plain_field(persons, identifier, data, "sources")
        _sort_plain_field(persons, identifier, data, "sources_other")
        _sort_paired_field(persons, identifier, data, "titles")
        _sort_paired_field(persons, identifier, data, "functions")

    persons["$schema"] = "../static/JSON/Individuals.json"

    write_single_json_file(persons, "outputs", "Individuals.json")


def _sort_plain_field(persons: dict, identifier: str, data: dict,
                      field: str) -> None:
    """Sort a list-of-strings field in place and report when it changed."""
    if len(data[field]) > 1 and data[field] != sorted(data[field]):
        persons[identifier][field] = sorted(data[field])
        print(f"Sorted {field} for {identifier}")


def _sort_paired_field(persons: dict, identifier: str, data: dict,
                       field: str) -> None:
    """Sort a list of pairs by second element (None first), reporting changes.

    Raises:
        IndexError: When an entry does not have a second element.
    """
    # (x[1] is not None, x[1]) puts None-valued pairs before real values
    # while avoiding a None-vs-value comparison inside sorted().
    try:
        if len(data[field]) > 1 and data[field] != sorted(
                data[field], key=lambda x: (x[1] is not None, x[1])):
            persons[identifier][field] = sorted(
                data[field], key=lambda x: (x[1] is not None, x[1]))
            print(f"Sorted {field} for {identifier}")
    except IndexError as error:
        raise IndexError(
            f"Something wrong with the {field} of {identifier}. Error: {error}"
        ) from error
# Example #4
# 0
def update_placenames_with_geonames(filename: str,
                                    key: str = "danielnoord") -> None:
    """Pull data from geonames and populate our database with it.

    Every placename must already carry a ``geonames_id``; its coordinates and
    Wikipedia link are fetched from the Geonames "details" endpoint.

    Args:
        filename: Filename to load
        key: Geonames API username to use for the lookups

    Raises:
        ValueError: When an entry has no Geonames ID, or its Geonames record
            is not a place, island, sea or estate.
    """
    # Load file; "$schema" is metadata, re-added before writing.
    with open(
            filename,
            encoding="utf-8",
    ) as file:
        placenames = json.load(file)
    del placenames["$schema"]
    placenames = cast(TranslationDictCleanedPlacenames, placenames)

    # Load data
    for value in placenames.values():
        if value["geonames_id"] is None:
            raise ValueError(f"{value['en_GB']} doesn't have a Geonames ID")
        geoname = geocoder.geonames(value["geonames_id"],
                                    method="details",
                                    key=key)
        if geoname.feature_class not in {
                "P",
                "T",
                "H",
                "S",
        }:  # Places, islands, seas or estates
            raise ValueError(
                f"""Geonames ID for {value['en_GB']} is not a place, island or sea.
                Please check https://www.geonames.org/{value['geonames_id']}"""
            )

        # Populate fields
        value["latitude"] = geoname.lat
        value["longitude"] = geoname.lng
        # Geonames returns wikipedia links without a scheme; prepend one.
        if geoname.wikipedia:
            value["geonames_wikipedia"] = f"https://{geoname.wikipedia}"
        else:
            value["geonames_wikipedia"] = None

    # Re-add schema
    placenames["$schema"] = "../../static/JSON/Placenames.json"

    write_single_json_file(placenames, "outputs", "Placenames.json")
def check_all_sources(filename: str) -> None:
    """Check and update all sources for given database.

    Matches every entry's ``sources`` and ``sources_other`` against the
    patterns in ``inputs/SourcePatterns.json`` via ``check_sources_entry``,
    then reports suspicious sources and unused patterns.

    Args:
        filename: File name of initial database
    """
    with open(filename, encoding="utf-8") as file:
        persons = json.load(file)
    # "$schema" is metadata, not a person entry; re-added before writing.
    del persons["$schema"]

    # Flatten the per-type pattern lists into one list.
    source_patterns = []
    with open("inputs/SourcePatterns.json", encoding="utf-8") as file:
        source_types = json.load(file)
        for sources in source_types.values():
            source_patterns += sources

    count_todo = 0
    probably_wrong: list[str] = []
    # Anchor each pattern to the end of the string it is matched against.
    compiled_source_patterns = [re.compile(f"{i}$") for i in source_patterns]
    used_patterns: set[re.Pattern[str]] = set()

    for identifier, data in persons.items():
        # Both source fields go through the exact same check.
        for field in ("sources", "sources_other"):
            (
                data[field],
                used_patterns,
                count_todo,
                probably_wrong,
            ) = check_sources_entry(
                data[field],
                compiled_source_patterns,
                used_patterns,
                identifier,
                count_todo,
                probably_wrong,
            )

    persons["$schema"] = "../static/JSON/Individuals.json"

    # Write new file if this file itself is run
    if __name__ == "__main__":
        write_single_json_file(persons, "outputs", "Individuals.json")
    if probably_wrong:
        print("\nThese sources might be wrong")
        print(
            "They have not been added to the list in python/json_check_sources.py"
        )
        print(
            r"However, that list is awful anyway and is in dire need of updating :')"
        )
        for i in probably_wrong:
            print("", i)
    if unused_patterns := [
            i for i in compiled_source_patterns if i not in used_patterns
    ]:
        print(
            f"Found the following unused source patterns:\n {unused_patterns}")
def convert_wikidata_to_isni(database: IndividualsDictCleaned) -> None:
    """Checks wikidata identifiers and sees if they can be converted to ISNI identifiers."""
    for data in database.values():
        wikidata_id = data.get("wikidata:id", None)
        # Skip entries that already have an ISNI id or lack a Wikidata id.
        if data.get("ISNI:id", None) or not wikidata_id:
            continue

        wikidata = wdi_core.WDItemEngine(
            wd_item_id=wikidata_id).get_wd_json_representation()
        # Wikidata property P213 holds the ISNI identifier, when present.
        isni_data = wikidata["claims"].get("P213", None)
        if isni_data:
            data["ISNI:id"] = isni_data[0]["mainsnak"]["datavalue"]["value"]
        else:
            data["ISNI:id"] = None

    database[
        "$schema"] = "../static/JSON/Individuals.json"  # type: ignore[assignment]
    write_single_json_file(database, "outputs", "Individuals.json")


def search_isni_api(database: IndividualsDictCleaned) -> None:
    """Checks name and surname pairs and sees if they match with ISNI identifiers."""
    # NOTE(review): this looks like a truncated duplicate of the earlier
    # search_isni_api definition — `records` is computed but never used, and
    # as the later definition it shadows the complete one at import time.
    # Confirm whether this copy should be removed.
    for data in database.values():
        if not data.get("ISNI:id", None):
            # Spaces become '+' so the full name fits in the query URL.
            name = f"{data['name']} {data['surname']}".replace(" ", "+")
            response = requests.get(
                f"http://isni.oclc.org/sru/?query=pica.nw+%3D+%22{name}%22&operation=searchRetrieve&recordSchema=isni-b&maximumRecords=10"  # pylint: disable=line-too-long
            )

            # Collect all <record> elements from the SRU XML response.
            records = list(
                ElementTree.fromstring(response.content).iter(
                    "{http://www.loc.gov/zing/srw/}record"))
def save_database(  # pylint: disable=too-many-locals
        filename: str,
        previous_database: Optional[IndividualsDict] = None) -> None:
    """Load database from .docx and write .json.

    Args:
        filename: Filename of the input file
        previous_database: Dict with data from previous database
    """
    doc = docx.Document(filename)
    all_individuals: IndividualsDict = {}

    # TODO: This does not work currently and removes fields we are using
    for para in doc.paragraphs:
        # Each paragraph is a sequence of "label: value" lines; splitting on
        # the labels yields exactly fourteen fields per individual.
        fields = re.split(r"\n.*?: ", para.text)
        (identifier, person_type, surname, name, date_of_birth,
         place_of_birth, date_of_death, place_of_death, titles, functions,
         comment, comment_daniel, sources, images) = fields

        all_individuals[identifier] = {  # type: ignore[assignment]
            "surname": surname,
            "person_type": int(person_type),
            "name": name,
            "date_of_birth": date_of_birth,
            "place_of_birth": place_of_birth,
            "date_of_death": date_of_death,
            "place_of_death": place_of_death,
            "titles": parse_title(titles),
            "functions": parse_function(functions),
            "comment": comment,
            "comment_daniel": comment_daniel,
            # Sources and images are "| "-separated single-line lists.
            "sources": sources.replace("\n", "").split("| "),
            "images": images.replace("\n", "").split("| "),
        }

    # Entries parsed from the .docx take precedence over previous data.
    if previous_database:
        all_individuals = previous_database | all_individuals

    # Sort and Schema, shouldn't sort a dict but oh well..
    all_individuals = dict(
        sorted(all_individuals.items(), key=lambda item: item[0]))
    all_individuals = {
        "$schema": "../static/JSON/Individuals.json"
    } | all_individuals

    write_single_json_file(all_individuals, "outputs", "Individuals.json")