def wikidata_entity_request(
    qids: List[str],
    language_keys: Optional[List[str]] = lang_keys,
    props: Optional[List[str]] = [
        CLAIMS,
        DESCRIPTION[PLURAL],
        LABEL[PLURAL],
        SITELINKS,
    ],
    timeout: Optional[int] = TIMEOUT,
    sleep_time: Optional[int] = SLEEP_TIME,
    maxlag: Optional[int] = MAX_LAG,
) -> Dict:
    """Send a wikidata entity request for a list of qids

    The API specifies that 50 items can be loaded at once without needing
    additional permissions:
    https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities

    Args:
        qids: List of qids
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
        props: Properties of the entity request. Defaults to [CLAIMS, DESCRIPTION[PLURAL], LABEL[PLURAL], SITELINKS]
        timeout: Timeout for the queries. Defaults to TIMEOUT
        sleep_time: Sleep time if errors occur. Defaults to SLEEP_TIME
        maxlag: Maxlag for the wikidata server, see
            https://www.mediawiki.org/wiki/Manual:Maxlag_parameter. Defaults to MAX_LAG

    Returns:
        Raw wikidata response for the requested entities

    Examples:
        wikidata_entity_request(["Q12418", "Q45585"])
    """
    initial_timeout = timeout
    langkey_plus_wiki_list = [key + "wiki" for key in language_keys]
    parameters = {
        "action": "wbgetentities",
        "ids": "|".join(qids),
        "format": JSON,
        "languages": "|".join(language_keys),
        "sitefilter": "|".join(langkey_plus_wiki_list),
        "props": "|".join(props),
        "redirects": "no",
        # if the server needs more than maxlag seconds to process
        # the query an error response is returned
        "maxlag": maxlag,
    }

    url = WIKIDATA_API_URL
    return send_http_request(
        parameters,
        HTTP_HEADER,
        url,
        logger,
        initial_timeout=initial_timeout,
        items=qids,
        timeout=timeout,
        sleep_time=sleep_time,
        maxlag=maxlag,
    )
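
# Usage sketch (not part of the original module): wbgetentities accepts at most
# 50 ids per request without additional permissions, so a caller would typically
# chunk a longer qid list before calling wikidata_entity_request. The helper name
# and structure below are illustrative assumptions, not existing API.
def wikidata_entity_request_chunked(qids: List[str]) -> List[Dict]:
    """Call wikidata_entity_request in batches of at most 50 qids."""
    chunk_size = 50  # limit documented for wbgetentities
    responses = []
    for start in range(0, len(qids), chunk_size):
        responses.append(wikidata_entity_request(qids[start : start + chunk_size]))
    return responses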
def get_wikipedia_page_ids(
    items: List[Dict],
    indices: List[int],
    langkey: str,
    timeout: Optional[int] = TIMEOUT,
    sleep_time: Optional[int] = SLEEP_TIME,
    maxlag: Optional[int] = MAX_LAG,
) -> Dict:
    """Get the wikipedia page ids from the page titles referenced in the sitelinks

    https://en.wikipedia.org/w/api.php?action=help&modules=query

    For example, the sitelink de: Mona_Lisa is resolved to the page id of the
    corresponding wikipedia article.

    Args:
        items: List of items
        indices: A list of indices which contain a sitelink
        langkey: A specific language key e.g. 'en'
        timeout: Timeout on the request. Defaults to TIMEOUT.
        sleep_time: Waiting time if there are serverside problems. Defaults to SLEEP_TIME.
        maxlag: Maxlag for the wikidata server, see
            https://www.mediawiki.org/wiki/Manual:Maxlag_parameter. Defaults to MAX_LAG

    Returns:
        A dictionary which maps the wikipedia page id (which is not a qid like in wikidata)
        to an index in the items dictionary

    Source:
        https://stackoverflow.com/questions/52787504/how-to-get-page-id-from-wikipedia-page-title
    """
    title_indice_dictionary = {}
    wikipedia_url = f"https://{langkey}.wikipedia.org/wiki/"
    for index in indices:
        title_indice_dictionary.update(
            {
                items[index][f"{WIKIPEDIA_LINK}_{langkey}"].replace(
                    wikipedia_url, ""
                ): index
            }
        )
    parameters = {
        "action": "query",
        "format": JSON,
        "prop": "info",
        "titles": "|".join(title_indice_dictionary.keys()),
        # if the server needs more than maxlag seconds to answer
        # the query an error response is returned
        "maxlag": maxlag,
    }
    url = f"https://{langkey}.wikipedia.org/w/api.php"
    response = send_http_request(
        parameters,
        HTTP_HEADER,
        url,
        logger,
        items=title_indice_dictionary.keys(),
        timeout=timeout,
        sleep_time=sleep_time,
        maxlag=maxlag,
    )

    page_normalized_titles = {x: x for x in title_indice_dictionary.keys()}

    # map index of json array to page id of wikipedia
    item_page_id_index_dictionary = {}
    if "normalized" in response["query"]:
        for mapping in response["query"]["normalized"]:
            page_normalized_titles[mapping["to"]] = mapping["from"]

    for page_id, page_info in response["query"]["pages"].items():
        normalized_title = page_info["title"]
        page_title = page_normalized_titles[normalized_title]
        index = title_indice_dictionary[page_title]
        item_page_id_index_dictionary[page_id] = index

    return item_page_id_index_dictionary
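
# Usage sketch (illustrative, not part of the original module): the indices
# argument of get_wikipedia_page_ids is expected to point at items that carry a
# sitelink for the requested language. Assuming an item has the key
# f"{WIKIPEDIA_LINK}_{langkey}" exactly when such a sitelink exists (the same
# key the function itself reads), those indices could be collected like this:
def indices_with_sitelink(items: List[Dict], langkey: str) -> List[int]:
    """Collect the positions of all items that have a wikipedia sitelink for langkey."""
    return [
        index
        for index, item in enumerate(items)
        if f"{WIKIPEDIA_LINK}_{langkey}" in item
    ]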
def get_wikipedia_extracts(
    items: List[Dict],
    page_id_index_dictionary: Dict,
    langkey: str,
    timeout: Optional[int] = TIMEOUT,
    sleep_time: Optional[int] = SLEEP_TIME,
    maxlag: Optional[int] = MAX_LAG,
) -> Dict:
    """Get the wikipedia extracts (in our data model they're called abstracts)

    https://en.wikipedia.org/w/api.php?action=help&modules=query

    Args:
        items: List of entities
        page_id_index_dictionary: Dictionary to resolve the page ids from the indices in the items list
        langkey: A specific language key e.g. 'en'
        timeout: Timeout on the request. Defaults to TIMEOUT.
        sleep_time: Waiting time if there are serverside problems. Defaults to SLEEP_TIME.
        maxlag: Maxlag for the wikidata server, see
            https://www.mediawiki.org/wiki/Manual:Maxlag_parameter. Defaults to MAX_LAG

    Returns:
        A dictionary with index and abstract which is added to the entity of the index later

    Source:
        https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
    """
    parameters = {
        "action": "query",
        "format": JSON,
        "prop": "extracts",
        "exintro": True,
        "explaintext": True,
        "pageids": "|".join(page_id_index_dictionary.keys()),
        # if the server needs more than maxlag seconds to answer
        # the query an error response is returned
        "maxlag": maxlag,
    }

    # Send HTTP-Request
    url = f"https://{langkey}.wikipedia.org/w/api.php"
    response = send_http_request(
        parameters,
        HTTP_HEADER,
        url,
        logger,
        items=page_id_index_dictionary.keys(),
        abstracts=True,
        timeout=timeout,
        sleep_time=sleep_time,
        maxlag=maxlag,
    )

    index_extract_dictionary = {}
    for page_id, index in page_id_index_dictionary.items():
        if int(page_id) < 0:
            logger.info(
                "For the wikidata item {0} no pageid was found on the {1}.wikipedia site. "
                "Therefore the extract is set to an empty string".format(
                    items[index]["id"], langkey
                )
            )
            # Use an empty extract for these cases
            index_extract_dictionary[index] = ""
            continue
        index_extract_dictionary[index] = response["query"]["pages"][page_id]["extract"]

    return index_extract_dictionary
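
# Usage sketch (illustrative, not part of the original module): chaining the two
# functions above to write the extracts back onto the items. The helper name and
# the attribute name f"abstract_{langkey}" are assumptions about the surrounding
# data model, not names taken from this module.
def add_wikipedia_extracts_to_items(items: List[Dict], indices: List[int], langkey: str) -> None:
    """Resolve page ids for the given indices and attach the extracts to the items."""
    page_id_index_dictionary = get_wikipedia_page_ids(items, indices, langkey)
    index_extract_dictionary = get_wikipedia_extracts(items, page_id_index_dictionary, langkey)
    for index, extract in index_extract_dictionary.items():
        items[index][f"abstract_{langkey}"] = extract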