from SPARQLWrapper import SPARQLWrapper

from data_extraction import map_wd_attribute, map_wd_response
from data_extraction.constants import *
from data_extraction.request_utils import send_http_request
from shared.blocklist import BLOCKLIST
from shared.utils import (
    chunks,
    language_config_to_list,
    setup_logger,
)

# Logger writing to logs/<GET_WIKIDATA_ITEMS_LOG_FILENAME> two directories above this file.
# NOTE(review): `Path` (used here) and `List` (used in the signature below) have no visible
# import in this chunk — presumably imported in a part of the file not shown; confirm.
logger = setup_logger(
    "data_extraction.load_wd_entities",
    Path(__file__).parent.parent.absolute()
    / "logs"
    / GET_WIKIDATA_ITEMS_LOG_FILENAME,
)

# First element of each row returned by language_config_to_list() — presumably the
# language codes from languageconfig.csv; verify against that helper.
lang_keys = [item[0] for item in language_config_to_list()]


# NOTE(review): this definition is cut off mid-docstring at the chunk boundary;
# the function body is not visible here.
def query_artwork_qids(type_name: str, wikidata_id: str) -> List[str]:
    """Extracts all artwork QIDs from the wikidata SPARQL endpoint https://query.wikidata.org/

    Args:
        type_name: type name to extract from, only relevant for console output
        wikidata_id: wikidata qid related to the given type name

    Returns:
        A list of all qids of the provided wikidata_id
import datetime
import json
import sys
from pathlib import Path
from typing import Dict, List, Optional

from data_extraction.constants import *
from data_extraction.request_utils import send_http_request
from shared.constants import JSON
from shared.utils import chunks, create_new_path, language_config_to_list, setup_logger, check_state, write_state

# When True, the ETL presumably resumes from persisted state (see the
# check_state/write_state imports above) instead of starting over — confirm in the body.
RECOVER_MODE = False

# NOTE(review): "EXTRACS" looks like a typo for "EXTRACTS", but the constant is defined
# in data_extraction.constants — fixing it requires a coordinated rename there.
logger = setup_logger(
    "data_extraction.get_wikipedia_extracts",
    Path(__file__).parent.parent.absolute()
    / "logs"
    / GET_WIKIPEDIA_EXTRACS_LOG_FILENAME,
)

# First element of each row returned by language_config_to_list() — presumably the
# language codes from languageconfig.csv.
lang_keys = [item[0] for item in language_config_to_list()]


# NOTE(review): this definition is cut off mid-docstring at the chunk boundary;
# the function body is not visible here. TIMEOUT/SLEEP_TIME/MAX_LAG come from the
# star import of data_extraction.constants.
def get_wikipedia_page_ids(
    items: List[Dict],
    indices: List[int],
    langkey: str,
    timeout: Optional[int] = TIMEOUT,
    sleep_time: Optional[int] = SLEEP_TIME,
    maxlag: Optional[int] = MAX_LAG,
) -> Dict:
    """Function to get the wikipedia page ids from their label referenced in the sitelinks
from typing import List, Dict, Set, Iterator, Optional, Callable

from data_extraction import map_wd_attribute
from data_extraction import map_wd_response
from data_extraction.constants import *
from data_extraction.request_utils import send_http_request
from shared.utils import chunks, create_new_path, language_config_to_list, setup_logger
from shared.constants import JSON, CSV
from SPARQLWrapper import SPARQLWrapper

# Development switches: when DEV is True, extraction presumably stops early after
# DEV_CHUNK_LIMIT chunks instead of processing everything — confirm in the body.
DEV = False
DEV_CHUNK_LIMIT = 2  # Not entry but chunks of 50

# NOTE(review): `Path` is used below but `from pathlib import Path` is not visible in
# this chunk — presumably imported in a part of the file not shown; confirm.
logger = setup_logger(
    "data_extraction.get_wikidata_items",
    Path(__file__).parent.parent.absolute()
    / "logs"
    / GET_WIKIDATA_ITEMS_LOG_FILENAME,
)

# First element of each row returned by language_config_to_list() — presumably the
# language codes from languageconfig.csv.
lang_keys = [item[0] for item in language_config_to_list()]


# NOTE(review): this definition is cut off mid-docstring at the chunk boundary;
# the function body is not visible here.
def query_artwork_qids(type_name: str, wikidata_id: str) -> List[str]:
    """Extracts all artwork QIDs from the wikidata SPARQL endpoint https://query.wikidata.org/

    Args:
        type_name: type name to extract from, only relevant for console output
        wikidata_id: wikidata qid related to the given type name

    Returns:
        A list of all qids of the provided wikidata_id
"""Mapping functions to extract information from wikidata JSON responses (especially
entity attributes) to the openArtBrowser data model
"""
import inspect
import re
from pathlib import Path
from typing import Any, Callable, Dict, List

from pywikibot import WbTime

from data_extraction.constants import *
from shared.utils import setup_logger

# Logger writing to logs/<WIKIDATA_MAP_ATTRIBUTE_LOG_FILENAME> two directories above
# this file; the filename constant comes from the star import above.
logger = setup_logger(
    "data_extraction.map_wd_attribute",
    Path(__file__).parent.parent.absolute()
    / "logs"
    / WIKIDATA_MAP_ATTRIBUTE_LOG_FILENAME,
)


# NOTE(review): this definition is cut off mid-docstring at the chunk boundary; the
# function body is not visible here. The docstring originally documented a parameter
# named "result" that does not exist in the signature — renamed to `entity_dict`.
def get_attribute_values_with_try_get_func(
    entity_dict: Dict,
    attribute_list: List,
    oab_type: str,
    try_get_func: Callable[[Dict, str, str, str], Any],
) -> Any:
    """Higher order function for map_wd_attribute function to bundle calls in for-loops

    Args:
        entity_dict: JSON response from wikidata
        attribute_list: attributes to extract with function
        oab_type: type name which is extracted
"""Functions to map a wikidata entity response to an openArtBrowser model
"""
from pathlib import Path
from typing import Dict, List, Optional

import data_extraction.map_wd_attribute as map_wd_attribute
from data_extraction.constants import *
from shared.utils import language_config_to_list, setup_logger

# Logger writing to logs/<WIKIDATA_MAP_RESPONSE_LOG_FILENAME> two directories above
# this file; the filename constant comes from the star import above.
logger = setup_logger(
    "data_extraction.map_wd_response",
    Path(__file__).parent.parent.absolute()
    / "logs"
    / WIKIDATA_MAP_RESPONSE_LOG_FILENAME,
)

# First element of each row returned by language_config_to_list() — presumably the
# language codes from languageconfig.csv; used below as the default language set.
lang_keys = [item[0] for item in language_config_to_list()]


# NOTE(review): this definition is cut off mid-docstring at the chunk boundary; the
# function body is not visible here. The module-level `lang_keys` default is evaluated
# once at import time (fine here because it is only read, never mutated — confirm).
def try_map_response_to_subject(
    response: Dict,
    type_name: str,
    language_keys: Optional[List[str]] = lang_keys,
) -> Dict:
    """Maps the default attributes which every subject has: qid, image, label, description,
    classes, wikipediaLink (including language specific attributes)

    Args:
        response: The wikidata entity which should be mapped to an openArtBrowser entity
        type_name: Type name of the entity
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
import json
import sys
from pathlib import Path
from typing import Dict, List, Optional

import requests

from shared.constants import (JSON, ARTWORK, ARTIST, MOVEMENT, PLURAL, ID, VIDEOS,
                              YOUTUBE_VIDEOS_FILE, ETL_STATES, ADD_YOUTUBE_VIDEOS_LOG_FILENAME)
from shared.utils import create_new_path, write_state, check_state, setup_logger

# setup logger
logger = setup_logger(
    "data_enhancement.add_youtube_videos",
    Path(__file__).parent.parent.absolute()
    / "logs"
    / ADD_YOUTUBE_VIDEOS_LOG_FILENAME,
)

# API key for the YouTube Data API, read from a local file at import time; falls back
# to an empty string when the file is absent.
# NOTE(review): the file handle from open() is never closed — prefer
# Path("google_dev_key.txt").read_text() or a `with` block.
try:
    GOOGLE_DEV_KEY = open("google_dev_key.txt").read()
except FileNotFoundError:
    GOOGLE_DEV_KEY = ""

# When True, presumably resumes from persisted ETL state (see check_state/write_state
# imports above) — confirm in the body.
RECOVER_MODE = False


# NOTE(review): parameter `id` shadows the builtin of the same name (rename would change
# the keyword-argument interface, so only flagged here). Definition is cut off
# mid-docstring at the chunk boundary; the function body is not visible here.
def check_yt_id_valid(id: str) -> bool:
    """Connects to the YT API and checks if the id is valid

    Args:
Returns:
    The ranked entities which means the attributes absolute rank and relative rank.
"""
import json
import datetime
from numbers import Number
from typing import List, Dict
from shared.constants import *
from shared.utils import create_new_path, write_state, check_state, setup_logger
from pathlib import Path
import sys

# setup logger
logger = setup_logger(
    "data_enhancement.ranking",
    Path(__file__).parent.parent.absolute() / "logs" / RANKING_LOG_FILENAME,
)

# When True, presumably resumes from persisted ETL state (see check_state/write_state
# imports above) — confirm in the body.
RECOVER_MODE = False


# NOTE(review): mutable default argument — the [ABSOLUTE_RANK, RELATIVE_RANK] list is
# created once at definition time and shared across calls; safe only if never mutated.
# Consider a tuple or a None sentinel (a code change, so only flagged here).
# Definition is cut off mid-docstring at the chunk boundary; body not visible.
def rank_artworks(
    artworks: List[Dict],
    ignore_keys: List[str] = [ABSOLUTE_RANK, RELATIVE_RANK]) -> List[Dict]:
    """Ranks a list of artwork entities (JSON-Objects)

    Args:
        artworks: List of artworks
        ignore_keys: Keys within the artwork entities which have to be ignored. Defaults to [ABSOLUTE_RANK, RELATIVE_RANK].

    Returns: