def get_sameas_links_from_external_id(pid: str, formatter_url: str = None) -> pd.DataFrame:
    """
    Get sameAs links between Wikidata and another database using its external identifier PID.

    Args:
        pid (str): PID for an external identifier
        formatter_url (str, optional): URL to map IDs to full URLs, with $1 in place of the ID,
            e.g. "https://collection.sciencemuseum.org.uk/$1". If omitted, the first
            'formatter URL' (P1630) value returned by Wikidata for `pid` is used.

    Returns:
        pd.DataFrame: columns wikidata_url, external_url. The frame is empty (but still has
            both columns) when no Wikidata item carries a value for `pid`.

    Raises:
        ValueError: if `formatter_url` is passed without a "$1" placeholder, or if it is
            omitted and Wikidata returns no formatter URL for `pid`.
    """
    if formatter_url is None:
        # get formatter URL
        query = f"""SELECT * WHERE {{ wd:{pid} wdt:P1630 ?url }} """
        res = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, query)["results"]["bindings"]
        if not res:
            raise ValueError(
                "No formatter URL found. Specify it in the `formatter_url` argument to this function instead."
            )
        formatter_url = res[0]["url"]["value"]
    elif "$1" not in formatter_url:
        raise ValueError(
            "Argument formatter_url must contain $1, describing where the ID appears."
        )

    # get wikidata urls and internal IDs for PID
    query = f"""SELECT * WHERE {{ ?wiki_url wdt:{pid} ?external_id . }}"""
    res = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, query)["results"]["bindings"]

    if not res:
        # No item uses this identifier: still honour the documented return shape
        # instead of leaving the result frame undefined.
        return pd.DataFrame(columns=["wikidata_url", "external_url"])

    res_df = pd.json_normalize(res)[["wiki_url.value", "external_id.value"]].rename(
        columns={
            "wiki_url.value": "wikidata_url",
            "external_id.value": "external_url",
        }
    )
    # Expand each bare external ID into a full URL using the formatter pattern
    res_df["external_url"] = res_df["external_url"].apply(
        lambda i: formatter_url.replace("$1", i)
    )

    return res_df
def _get_unlabelled_records_from_sparql_store(self, limit: int = None) -> Iterable[dict]:
    """
    Fetch the URI and label of every record of this table that has no owl:sameAs value.

    The query is run against the endpoint configured as `config.FUSEKI_ENDPOINT` and is
    restricted by `self._get_type_constraint()`, `self.extra_sparql_lines` and
    `self.table_name`.

    Args:
        limit (int, optional): if given, appended to the query as a LIMIT clause.
            Defaults to None (no limit).

    Returns:
        Generator of dicts. Each dict has the form {"id": __, "label": ___}
    """
    sparql_query = f"""SELECT DISTINCT ?item ?itemLabel WHERE {{ FILTER NOT EXISTS {{?item owl:sameAs ?object}}. ?item rdfs:label ?itemLabel. {self._get_type_constraint()} {self.extra_sparql_lines} ?item skos:hasTopConcept '{self.table_name}'. }}"""

    if limit is not None:
        sparql_query += f"LIMIT {limit}"

    response = get_sparql_results(config.FUSEKI_ENDPOINT, sparql_query)
    bindings = response["results"]["bindings"]

    # The request above runs eagerly; only the reshaping of rows is lazy.
    return (
        {"id": row["item"]["value"], "label": row["itemLabel"]["value"]}
        for row in bindings
    )
def test_propertylookup_query():
    """A property-lookup query over two entities yields two bindings of six fields each."""
    sparql_query = """ SELECT ?item ?itemLabel ?itemDescription ?altLabel ?P570Label ?P569Label WHERE { VALUES (?item) { (wd:Q106481) (wd:Q46633) } OPTIONAL{ ?item wdt:P570 ?P570 .} OPTIONAL{ ?item wdt:P569 ?P569 .} OPTIONAL { ?item skos:altLabel ?altLabel . FILTER (lang(?altLabel) = "en") } SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . } } """

    bindings = get_sparql_results(endpoint, sparql_query)["results"]["bindings"]

    # one result for each entity
    assert len(bindings) == 2

    # one column for each value in the SELECT slug
    first_row = bindings[0]
    assert len(first_row) == 6
def test_shortestpath_query():
    """The shortest combined path from Q22687 and Q43229 to a common superclass has length 2."""
    sparql_query = """PREFIX gas: <http://www.bigdata.com/rdf/gas#> SELECT ?super (?aLength + ?bLength as ?length) WHERE { SERVICE gas:service { gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ; gas:in wd:Q22687 ; gas:traversalDirection "Forward" ; gas:out ?super ; gas:out1 ?aLength ; gas:maxIterations 10 ; gas:linkType wdt:P279 . } SERVICE gas:service { gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ; gas:in wd:Q43229 ; gas:traversalDirection "Forward" ; gas:out ?super ; gas:out1 ?bLength ; gas:maxIterations 10 ; gas:linkType wdt:P279 . } } ORDER BY ?length LIMIT 1"""

    response = get_sparql_results(endpoint, sparql_query)
    shortest = response["results"]["bindings"][0]

    # The length comes back as a string; go through float before truncating to int
    path_length = int(float(shortest["length"]["value"]))
    assert path_length == 2
def test_get_sparql_results(self):
    """A trivial query against the Wikidata endpoint returns a results/bindings structure."""
    wikidata_endpoint = "https://query.wikidata.org/sparql"
    sparql_query = "SELECT ?s ?p ?o WHERE {?s ?p ?o} LIMIT 20"

    response = sparql.get_sparql_results(
        wikidata_endpoint, sparql_query, add_prefixes=True
    )

    assert "results" in response
    assert "bindings" in response["results"]
def lookup_wikidata_id(self, pid: str, uid: str) -> str:
    """
    Lookup UID on Wikidata against given property ID of source

    Args:
        pid (str): Property ID of source (e.g. OxDnB ID: P1415)
        uid (str): Value of source ID (e.g. an OxDnB ID: 23105)

    Returns:
        qcode: Wikidata qcode in format Q(d+), taken from the first matching item.
            None if the query returns nothing, no item matches, or the item URL
            contains no qcode.
    """
    endpoint_url = config.WIKIDATA_SPARQL_ENDPOINT

    # The UID is placed inside a double-quoted SPARQL literal: escape backslashes and
    # quotes so a value containing them cannot terminate the literal early.
    escaped_uid = uid.replace("\\", "\\\\").replace('"', '\\"')

    query = f""" SELECT ?item ?itemLabel WHERE {{ ?item wdt:{pid} "{escaped_uid}". SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }} }} """

    res = get_sparql_results(endpoint_url, query)
    if not res:
        return None

    wikidata = res["results"]["bindings"]
    if not wikidata or not wikidata[0]:
        return None

    # Only the first binding is used; the qcode is the first Q<digits> run in its URL
    qcode_match = re.search(r"(Q\d+)", wikidata[0]["item"]["value"])
    return qcode_match.group(1) if qcode_match else None
def _get_subject_items_from_pid(pid: str) -> list:
    """
    Return the subject items of a Wikidata property, looked up through
    'subject item of this property (P1629)'.

    `pid` may also be a URL (anything starting with "http"); in that case a warning is
    logged and the PID is extracted from it. A ValueError is raised unless the URL
    contains exactly one PID.

    Returns:
        list: one entry per binding, each converted with `url_to_qid`; an empty list if
            the response has no "results" key.
    """
    if pid.startswith("http"):
        logger.warning("WARNING: URL instead of PID entered. Converting to PID")
        found_pids = re.findall(r"(P\d+)", pid)

        if len(found_pids) != 1:
            raise ValueError("URL not a valid property URL.")
        pid = found_pids[0]

    sparql_query = f""" SELECT ?property WHERE {{ wd:{pid} wdt:P1629 ?property. }} """
    response = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, sparql_query)

    if "results" not in response:
        return []

    return [
        url_to_qid(row["property"]["value"])
        for row in response["results"]["bindings"]
    ]
def get_wikidata_equivalents_for_properties(
    properties: List[str], raise_missing=False, warn_missing=True
) -> dict:
    """
    Get Wikidata equivalents for RDF properties.

    Properties that already start with the WDT namespace are mapped to themselves without
    a lookup. The rest are looked up through 'equivalent property' (P1628); no request is
    made when there is nothing left to look up.

    Args:
        properties (List[str]): list of URIs of properties
        raise_missing (bool, optional): If True, raises a ValueError if Wikidata equivalents can't be found for any of
            the specified properties. Defaults to False.
        warn_missing (bool, optional): If True, logs a warning if Wikidata equivalents can't be found for any of
            the specified properties. Defaults to True.

    Returns:
        dict: {property: wikidata_value, ...}. Any properties that don't have a corresponding Wikidata value will
            have value None unless they are already a Wikidata property value, in which case their value is the
            same as their key.

    Raises:
        ValueError: if `raise_missing` is True and at least one property has no equivalent.
    """
    wiki_properties = [p for p in properties if p.startswith(str(WDT))]
    # Sorted so the generated query is the same for the same set of inputs
    lookup_properties = sorted(set(properties) - set(wiki_properties))

    internal_wikidata_mapping = {}

    if lookup_properties:
        values_slug = " ".join("<" + uri + ">" for uri in lookup_properties)

        query = f"""SELECT * WHERE {{ VALUES ?internal_property {{ {values_slug} }}. ?wiki_property wdt:P1628 ?internal_property. }}"""

        res = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, query)["results"][
            "bindings"
        ]

        internal_wikidata_mapping = {
            item["internal_property"]["value"]: item["wiki_property"]["value"]
            for item in res
        }

    internal_wikidata_mapping.update({p: p for p in wiki_properties})

    missing_internal_vals = set(properties) - set(internal_wikidata_mapping)

    if missing_internal_vals:
        if raise_missing:
            raise ValueError(
                f"Values {missing_internal_vals} are missing from results. To disable this raising an exception, set input raise_missing to False."
            )

        if warn_missing:
            logger.warning(f"Values {missing_internal_vals} are missing from results.")

        # Every requested property gets a key, even when nothing was found for it
        for val in missing_internal_vals:
            internal_wikidata_mapping[val] = None

    return internal_wikidata_mapping
def _get_predicates(self, predicates_ignore: List[str] = None) -> List[str]:
    """
    Get a unique list of predicates for the table. These will form the columns of X.

    Args:
        predicates_ignore (List[str], optional): predicates to ignore. Defaults to None,
            in which case RDFS.label, OWL.sameAs, SKOS.hasTopConcept and FOAF.title are
            ignored.

    Returns:
        list of URLs for each predicate, excluding those in `predicates_ignore`.
        Empty if the query returns no predicates.
    """
    if predicates_ignore is None:
        # Built per call rather than as a shared mutable default argument
        predicates_ignore = [
            RDFS.label,
            OWL.sameAs,
            SKOS.hasTopConcept,
            FOAF.title,
        ]

    # TODO: remove this when using pydantic as it will coerce rdflib.term.URIRef to string
    ignored = {str(i) for i in predicates_ignore}

    query = f""" SELECT DISTINCT ?predicate WHERE {{ ?subject <http://www.w3.org/2004/02/skos/core#hasTopConcept> '{self.table_name}'. ?subject ?predicate ?object. }}"""

    res = get_sparql_results(config.FUSEKI_ENDPOINT, query)["results"]["bindings"]

    # An empty result set naturally produces an empty list here
    return [
        i["predicate"]["value"]
        for i in res
        if i["predicate"]["value"] not in ignored
    ]
def test_entitysearch_query():
    """An EntitySearch query for "bank", restricted to the Q43229 class tree, returns results."""
    sparql_query = """SELECT DISTINCT ?item ?itemLabel WHERE { ?item wdt:P31/wdt:P279* wd:Q43229. SERVICE wikibase:mwapi { bd:serviceParam wikibase:api "EntitySearch" . bd:serviceParam wikibase:endpoint "www.wikidata.org" . bd:serviceParam mwapi:search "bank" . bd:serviceParam mwapi:language "en" . ?item wikibase:apiOutputItem mwapi:item . ?num wikibase:apiOrdinal true . } SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . } }"""

    response = get_sparql_results(endpoint, sparql_query)
    bindings = response["results"]["bindings"]

    # at least one result returned
    assert len(bindings) > 0
def get_internal_urls_from_wikidata(
    url_pattern: str, wikidata_endpoint=config.WIKIDATA_SPARQL_ENDPOINT
):
    """
    Fetch every Wikidata record whose 'described at URL' (P973) value matches `url_pattern`.

    Every "http://" in a returned internal URL is rewritten to "https://".

    Args:
        url_pattern (str): the regex pattern to describe collection URLs. The Science Museum's is
            'collection.sciencemuseum.org.uk'.
        wikidata_endpoint (str, optional): SPARQL endpoint for Wikidata.

    Returns:
        pd.DataFrame: columns item (Wikidata URL), itemLabel (label) and internalURL (internal URL).
            When the query has no results the frame from `pd.json_normalize` is returned as is.
    """

    def _force_https(url):
        return url.replace("http://", "https://")

    sparql_query = f""" SELECT DISTINCT ?item ?itemLabel ?internalURL WHERE {{ ?item wdt:P973 ?internalURL filter( regex(str(?internalURL), "{url_pattern}" ) ) SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }} }} """

    bindings = get_sparql_results(wikidata_endpoint, sparql_query)["results"]["bindings"]
    res_df = pd.json_normalize(bindings)

    if len(res_df) > 0:
        wanted = ["item.value", "itemLabel.value", "internalURL.value"]
        # keep only the value columns and drop their ".value" suffix
        res_df = res_df[wanted].rename(columns=lambda x: x.replace(".value", ""))
        res_df["internalURL"] = res_df["internalURL"].apply(_force_https)

    return res_df
def filter_qids_in_class_tree(
    qids: list,
    higher_class: Union[str, list],
    classes_exclude: Union[str, list] = None,
    include_instanceof: bool = False,
) -> list:
    """
    Returns filtered list of QIDs/PIDs that exist in the class tree below the QID or any of the QIDs
    defined by `higher_class`. Raises if higher_class is not a valid QID.

    Args:
        qids (list): list of QIDs
        higher_class (Union[str, list]): QID or QIDs of higher class to filter on
        classes_exclude (Union[str, list]): QID or QIDs of higher classes to exclude. Defaults to None.
        include_instanceof (bool, optional): whether to include an initial instance of (P31) step
            in the class tree. Defaults to False.

    Returns:
        list: unique list of filtered QIDs. Empty, without querying Wikidata, when `qids` is empty.
    """
    formatted_qids = join_qids_for_sparql_values_clause(qids)

    # assume format of each item of qids has already been checked
    # TODO: what's a good pattern for coordinating this checking so it's not done multiple times?

    class_tree = "wdt:P31/wdt:P279*" if include_instanceof else "wdt:P279*"

    def generate_exclude_slug(c):
        # NOTE(review): the exclusion always walks wdt:P279* even when include_instanceof
        # is True - confirm this is intended.
        return f"""MINUS {{?item wdt:P279* wd:{c}. hint:Prior hint:gearing "forward".}}."""

    if classes_exclude:
        if isinstance(classes_exclude, str):
            raise_invalid_qid(classes_exclude)
            exclude_slug = generate_exclude_slug(classes_exclude)

        elif isinstance(classes_exclude, list):
            # validate every class before building any part of the query
            for c in classes_exclude:
                raise_invalid_qid(c)
            exclude_slug = "\n".join(generate_exclude_slug(c) for c in classes_exclude)

        else:
            errors.raise_must_be_str_or_list("classes_exclude")
    else:
        exclude_slug = ""

    if isinstance(higher_class, str):
        raise_invalid_qid(higher_class)
        query = f"""SELECT DISTINCT ?item WHERE {{ VALUES ?item {{ {formatted_qids} }} ?item {class_tree} wd:{higher_class}. hint:Prior hint:gearing "forward". 
{exclude_slug} }}"""

    elif isinstance(higher_class, list):
        for c in higher_class:
            raise_invalid_qid(c)
        classes_str = ", ".join("wd:" + x for x in higher_class)

        query = f"""SELECT DISTINCT ?item WHERE {{ VALUES ?item {{ {formatted_qids} }} ?item {class_tree} ?tree. hint:Prior hint:gearing "forward". FILTER (?tree in ({classes_str})) {exclude_slug} }}"""

    else:
        errors.raise_must_be_str_or_list("higher_class")

    if len(qids) == 0:
        # Nothing to filter: all arguments have been validated, so skip the request
        return []

    res = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, query)

    # strip the entity URL prefix so that bare IDs are returned
    return [
        i["item"]["value"].replace("http://www.wikidata.org/entity/", "")
        for i in res["results"]["bindings"]
    ]
def get_distance_between_entities(
    qcode_set: Set[str],
    bidirectional: bool = False,
    vertex_pid: str = "P279",
    reciprocal: bool = False,
    max_path_length: int = 10,
) -> float:
    """
    Get the length of the shortest path between two entities in `qcode_set` along the axis given
    by `vertex_pid` ('subclass of' by default). Flag `reciprocal=True` returns 1/(1+l) where l is
    the length of the shortest path, which can be treated as a similarity measure.

    Args:
        qcode_set (Set[str]): one or two QIDs. A single-item set is treated as two identical
            entities.
        bidirectional (bool, optional): If True, paths between entities where the direction is reversed (only once)
            will be considered. Otherwise only the forward direction specified by the PID in `vertex_pid` will be
            considered. Defaults to False.
        vertex_pid (str, optional): this PID specifies the edge types to use for the calculation.
            Defaults to "P279".
        reciprocal (bool, optional): Return 1/(1+l), where l is the length of the shortest path. Defaults to False.
        max_path_length (int, optional): Maximum iterations to look for the shortest path. If no path is found
            within it, 10*max_path_length (reciprocal=False) or 1/(1+10*max_path_length) (reciprocal=True)
            is returned. Defaults to 10.

    Returns:
        float: distance (d <= max_path_length or max_path_length*10) or reciprocal distance (0 < d <= 1).
            If either QID is the empty string, 1 (reciprocal=False) or 0 (reciprocal=True) is returned
            without querying.

    Raises:
        ValueError: if `qcode_set` does not contain exactly 1 or 2 items.
    """
    if len(qcode_set) == 1:
        # identity - assume two values have been passed in even though the set will have length 1
        return 1 if reciprocal else 0

    if len(qcode_set) != 2:
        raise ValueError("Input variable qcode_set must contain exactly 1 or 2 items")

    qcodes = list(qcode_set)

    if (qcodes[0] == "") or (qcodes[1] == ""):
        # at least one value is empty so return maximum dissimilarity
        # NOTE(review): the non-reciprocal value here (1) is smaller than the "no path found"
        # value below (10 * max_path_length) - confirm this is intended.
        return 0 if reciprocal else 1

    raise_invalid_qid(qcodes[0])
    raise_invalid_qid(qcodes[1])

    if bidirectional:
        query = f"""PREFIX gas: <http://www.bigdata.com/rdf/gas#> SELECT ?super (?aLength + ?bLength as ?length) WHERE {{ SERVICE gas:service {{ gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ; gas:in wd:{qcodes[0]} ; gas:traversalDirection "Forward" ; gas:out ?super ; gas:out1 ?aLength ; gas:maxIterations {max_path_length} ; gas:linkType wdt:{vertex_pid} . }} SERVICE gas:service {{ gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ; gas:in wd:{qcodes[1]} ; gas:traversalDirection "Forward" ; gas:out ?super ; gas:out1 ?bLength ; gas:maxIterations {max_path_length} ; gas:linkType wdt:{vertex_pid} . }} }} ORDER BY ?length LIMIT 1 """
    else:
        # NOTE: two distances are returned in this query to account for the fact that we don't know whether
        # qcodes[0] or qcodes[1] is higher in the hierarchy, and setting gas:traversalDirection "Undirected"
        # gives a WDQS error. One of these distances is zero as it's the distance between an entity and itself,
        # so the max of the two is returned by this function.
        # Both SERVICE clauses follow `vertex_pid`, so the two traversals use the same edge type.
        query = f""" PREFIX gas: <http://www.bigdata.com/rdf/gas#> SELECT ?aLength ?bLength WHERE {{ SERVICE gas:service {{ gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ; gas:in wd:{qcodes[0]} ; gas:traversalDirection "Forward" ; gas:out ?super ; gas:out1 ?aLength ; gas:maxIterations {max_path_length} ; gas:linkType wdt:{vertex_pid} . 
}} SERVICE gas:service {{ gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ; gas:in wd:{qcodes[1]} ; gas:traversalDirection "Forward" ; gas:out ?super ; gas:out1 ?bLength ; gas:maxIterations {max_path_length} ; gas:linkType wdt:{vertex_pid} . }} FILTER (?super in (wd:{qcodes[0]}, wd:{qcodes[1]})). }} """

    result = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, query)["results"][
        "bindings"
    ]

    if len(result) == 0:
        # no path found within max_path_length: fall back to a large sentinel distance
        distance = 10 * max_path_length
    elif bidirectional:
        distance = int(float(result[0]["length"]["value"]))
    else:
        distance = int(
            max(
                float(result[0]["aLength"]["value"]),
                float(result[0]["bLength"]["value"]),
            )
        )

    return 1 / (1 + distance) if reciprocal else distance
def _run_wikidata_query(self, qcodes: list, instanceof_filter: bool, **kwargs) -> pd.DataFrame:
    """
    Runs a parametrised Wikidata query with options to filter by instance or subclass of a specific property.
    Returns a dataframe with qcodes matching the filter, their labels and their aliases.

    The query text is stored on `self.query` and the resulting dataframe on `self.sparql_res`.

    Args:
        qcodes (list): a list of qcodes before filtering
        instanceof_filter (bool): whether to filter results by instance or subclass of a certain property

    Kwargs:
        property_id (str): the property to use as a parameter for the 'instance of' filter
        include_class_tree (bool): whether to include all subclasses in the search up the tree, or
            just the instanceof property

    Returns:
        pd.DataFrame: columns are qcode, label, alias, birthYear, deathYear, inceptionYear,
            dissolvedYear. Empty (with those columns) when the query returns no bindings.

    Raises:
        ValueError: if `instanceof_filter` is True and `property_id` or `include_class_tree`
            is not passed.
    """
    if instanceof_filter:
        # process kwargs related to the 'instance of' filter
        try:
            property_id = kwargs["property_id"]
        except KeyError:
            raise ValueError(
                "Keyword argument property_id (str) must be passed if using instance_of_filter."
            )

        try:
            include_class_tree = kwargs["include_class_tree"]
        except KeyError:
            raise ValueError(
                "Keyword argument include_class_tree (bool) must be passed if using instance_of_filter."
            )

        # create line of SPARQL query that does filtering
        class_tree = "/wdt:P279*" if include_class_tree else ""
        sparq_instanceof = f"?item wdt:P31{class_tree} wd:{property_id}."
    else:
        sparq_instanceof = ""

    def map_ids(ids):
        return " ".join([f"(wd:{i})" for i in ids])

    query = f""" SELECT ?item ?itemLabel ?altLabel ?birthYear ?deathYear ?inceptionYear ?dissolvedYear WHERE {{ VALUES (?item) {{ {map_ids(qcodes)} }} {sparq_instanceof} OPTIONAL{{ ?item wdt:P569 ?birthDate. BIND( year(?birthDate) AS ?birthYear ) }} OPTIONAL {{ ?item wdt:P570 ?deathDate. BIND( year(?deathDate) AS ?deathYear ) }} OPTIONAL {{ ?item wdt:P571 ?inceptionDate. BIND( year(?inceptionDate) AS ?inceptionYear ) }} OPTIONAL {{ ?item wdt:P576 ?dissolvedDate. BIND( year(?dissolvedDate) AS ?dissolvedYear ) }} OPTIONAL {{ ?item skos:altLabel ?altLabel . 
FILTER (lang(?altLabel) = "en") }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }} }} """
    self.query = query

    res = get_sparql_results(self.sparql_endpoint_url, query)["results"]["bindings"]

    final_cols = [
        "qcode",
        "itemLabel.value",
        "altLabel.value",
        "birthYear.value",
        "deathYear.value",
        "inceptionYear.value",
        "dissolvedYear.value",
    ]

    res_df = pd.json_normalize(res)

    if res_df.empty:
        # No bindings: json_normalize yields a frame with no columns, so looking up
        # "item.value" would raise KeyError. Start from an empty frame with the right columns.
        res_df = pd.DataFrame(columns=final_cols)
    else:
        res_df.loc[:, "qcode"] = res_df["item.value"].apply(
            lambda x: re.findall(r"(Q\d+)", x)[0]
        )

        # fill missing columns with blanks for any columns that aren't in the data
        for col in final_cols:
            if col not in res_df.columns:
                res_df[col] = ""

        res_df = res_df[final_cols]

    # convert aliases to lowercase and fill nan with empty string
    res_df["altLabel.value"] = (
        res_df["altLabel.value"].fillna("").astype(str).str.lower()
    )
    res_df = res_df.drop_duplicates()

    # rename columns (remove .value suffix from year columns)
    res_df = res_df.rename(
        columns={"itemLabel.value": "label", "altLabel.value": "alias"}
    )
    res_df = res_df.rename(columns=lambda x: x.replace(".value", ""))

    self.sparql_res = res_df

    return res_df
def run_search(self, text: str, limit=100, **kwargs) -> pd.DataFrame:
    """
    Run Wikidata search.

    Args:
        text (str): text to search
        limit (int, optional): Defaults to 100.

    Kwargs:
        instanceof_filter (str/list): property or properties to filter values by instance of.
        include_class_tree (bool): whether to look in the subclass tree for the instance of filter.
            Only a truthy value enables it.
        property_filters (dict): filters on exact values of properties you want to pass through.
            {property: value, ...}. A value may be a single QID or a list of QIDs; with a list,
            an item matches if its property value is any of them.

    Returns:
        pd.DataFrame: columns rank, item, itemLabel, plus whatever
            `self.add_score_to_search_results_df` adds. When the search has no results the
            empty frame from `pd.json_normalize` is returned unchanged.
    """
    # Honour the value of the flag, not merely its presence in kwargs
    class_tree = "/wdt:P279*" if kwargs.get("include_class_tree") else ""

    # Both slugs default to empty so the query is always well defined
    sparq_instanceof = ""
    sparq_property_filter = ""

    property_id = kwargs.get("instanceof_filter")
    if isinstance(property_id, str):
        # one instanceof in filter
        sparq_instanceof = f"?item wdt:P31{class_tree} wd:{property_id}."
    elif isinstance(property_id, list):
        if len(property_id) == 1:
            sparq_instanceof = f"?item wdt:P31{class_tree} wd:{property_id[0]}."
        else:
            ids = ", ".join(["wd:" + x for x in property_id])
            sparq_instanceof = f" ?item wdt:P31{class_tree} ?tree. \n FILTER (?tree in ({ids}))"

    property_filters = kwargs.get("property_filters") or {}
    for idx, (prop, value) in enumerate(property_filters.items()):
        if isinstance(value, str):
            assert_qid_format(value)
            sparq_property_filter += f"\n ?item wdt:{prop} wd:{value} ."
        elif isinstance(value, list):
            for qid in value:
                assert_qid_format(qid)

            if len(value) == 1:
                sparq_property_filter += f"\n ?item wdt:{prop} wd:{value[0]} ."
            else:
                # Each multi-value filter gets its own variable and extends the property
                # filter, so it neither replaces the instance-of filter nor clashes with ?tree.
                ids = ", ".join(["wd:" + x for x in value])
                filter_var = f"?filter_value_{idx}"
                sparq_property_filter += (
                    f"\n ?item wdt:{prop} {filter_var}. \n FILTER ({filter_var} in ({ids}))"
                )

    # The search text sits inside a double-quoted SPARQL literal: escape the characters
    # that would otherwise terminate or break it.
    escaped_text = (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    endpoint_url = config.WIKIDATA_SPARQL_ENDPOINT
    query = f""" SELECT DISTINCT ?item ?itemLabel WHERE {{ {sparq_instanceof} {sparq_property_filter} SERVICE wikibase:mwapi {{ bd:serviceParam wikibase:api "EntitySearch" . bd:serviceParam wikibase:endpoint "www.wikidata.org" . bd:serviceParam mwapi:search "{escaped_text}" . bd:serviceParam mwapi:language "en" . ?item wikibase:apiOutputItem mwapi:item . 
?num wikibase:apiOrdinal true . }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }} }} LIMIT {limit} """

    res = get_sparql_results(endpoint_url, query)["results"]["bindings"]

    res_df = pd.json_normalize(res)

    if len(res_df) > 0:
        res_df = res_df[["item.value", "itemLabel.value"]].rename(
            columns=lambda x: x.replace(".value", "")
        )
        # rank is the 1-based position of each row in the returned results
        res_df = res_df.reset_index().rename(columns={"index": "rank"})
        res_df["rank"] = res_df["rank"] + 1

        res_df = self.add_score_to_search_results_df(res_df, rank_col="rank")

    return res_df
def run_search(self, text: str, limit=100, similarity_thresh=50, **kwargs):
    """
    Run Wikipedia search, then rank and limit results based on string similarity.

    Args:
        text (str): text to search
        limit (int, optional): Defaults to 100.
        similarity_thresh (int, optional): The text similarity cut-off to exclude items from search results.
            Defaults to 50.

    Kwargs:
        instanceof_filter (str/list): property or properties to filter values by instance of.
        include_class_tree (bool): whether to look in the subclass tree for the instance of filter.
            Only a truthy value enables it.
        property_filters (dict): filters on exact values of properties you want to pass through.
            {property: value, ...}. A value may be a single QID or a list of QIDs; with a list,
            an item matches if its property value is any of them.

    Returns:
        pd.DataFrame: columns rank, item, wikipedia_title, plus whatever
            `self.add_score_to_search_results_df` adds. Rows are ordered by descending
            similarity between `text` and the Wikipedia title. When the search has no results
            the empty frame from `pd.json_normalize` is returned unchanged.
    """
    # Honour the value of the flag, not merely its presence in kwargs
    class_tree = "/wdt:P279*" if kwargs.get("include_class_tree") else ""

    # Both slugs default to empty so the query is always well defined
    sparq_instanceof = ""
    sparq_property_filter = ""

    property_id = kwargs.get("instanceof_filter")
    if isinstance(property_id, str):
        # one instanceof in filter
        sparq_instanceof = f"?item wdt:P31{class_tree} wd:{property_id}."
    elif isinstance(property_id, list):
        if len(property_id) == 1:
            sparq_instanceof = f"?item wdt:P31{class_tree} wd:{property_id[0]}."
        else:
            ids = ", ".join(["wd:" + x for x in property_id])
            sparq_instanceof = f" ?item wdt:P31{class_tree} ?tree. \n FILTER (?tree in ({ids}))"

    property_filters = kwargs.get("property_filters") or {}
    for idx, (prop, value) in enumerate(property_filters.items()):
        if isinstance(value, str):
            assert_qid_format(value)
            sparq_property_filter += f"\n ?item wdt:{prop} wd:{value} ."
        elif isinstance(value, list):
            for qid in value:
                assert_qid_format(qid)

            if len(value) == 1:
                sparq_property_filter += f"\n ?item wdt:{prop} wd:{value[0]} ."
            else:
                # Each multi-value filter gets its own variable and extends the property
                # filter, so it neither replaces the instance-of filter nor clashes with ?tree.
                ids = ", ".join(["wd:" + x for x in value])
                filter_var = f"?filter_value_{idx}"
                sparq_property_filter += (
                    f"\n ?item wdt:{prop} {filter_var}. \n FILTER ({filter_var} in ({ids}))"
                )

    # The search text sits inside a double-quoted SPARQL literal: escape the characters
    # that would otherwise terminate or break it. The unescaped text is still used for
    # the similarity comparison below.
    escaped_text = (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    endpoint_url = config.WIKIDATA_SPARQL_ENDPOINT
    query = f""" SELECT ?item ?wikipedia_title {{ SERVICE wikibase:mwapi {{ bd:serviceParam wikibase:endpoint "en.wikipedia.org" . bd:serviceParam wikibase:api "Generator" . 
bd:serviceParam mwapi:generator "search" . bd:serviceParam mwapi:gsrsearch "{escaped_text}" . bd:serviceParam mwapi:gsrlimit "max" . ?item wikibase:apiOutputItem mwapi:item . ?wikipedia_title wikibase:apiOutput mwapi:title . }} hint:Prior hint:runFirst "true". {sparq_instanceof} {sparq_property_filter} }} LIMIT {limit} """

    res = get_sparql_results(endpoint_url, query)["results"]["bindings"]

    res_df = pd.json_normalize(res)

    if len(res_df) > 0:
        res_df = res_df[["item.value", "wikipedia_title.value"]].rename(
            columns=lambda x: x.replace(".value", "")
        )

        # Score each title against the search text, drop those under the threshold and
        # order the rest from most to least similar.
        res_df["text_similarity"] = res_df["wikipedia_title"].apply(
            lambda s: self.calculate_label_similarity(text, s)
        )
        res_df = (
            res_df[res_df["text_similarity"] >= similarity_thresh]
            .sort_values("text_similarity", ascending=False)
            .reset_index(drop=True)
        )
        res_df = res_df.drop(columns="text_similarity")

        # rank is the 1-based position after sorting by similarity
        res_df = res_df.reset_index().rename(columns={"index": "rank"})
        res_df["rank"] = res_df["rank"] + 1

        res_df = self.add_score_to_search_results_df(res_df, rank_col="rank")

    return res_df