Esempi in Python per get_sparql_results, esempi in Python per heritageconnector.utils.sparql.get_sparql_results

Esempio n. 1

0

Mostra file

def get_sameas_links_from_external_id(pid: str,
                                      formatter_url: str = None
                                      ) -> pd.DataFrame:
    """
    Get sameAs links between Wikidata and another database using its external identifier PID.

    Args:
        pid (str): PID for an external identifier
        formatter_url (str, optional): URL to map IDs to full URLs, with $1 in place of the ID,
            e.g. "https://collection.sciencemuseum.org.uk/$1". If omitted, the first
            formatter URL (P1630) recorded on Wikidata for `pid` is used.

    Returns:
        pd.DataFrame: columns wikidata_url, external_url. The frame is empty (but still has
            both columns) when no Wikidata item uses the property.

    Raises:
        ValueError: if `formatter_url` is passed without a "$1" placeholder, or if it is
            omitted and Wikidata returns no formatter URL for `pid`.
    """

    if formatter_url is None:
        # get formatter URL
        query = f"""SELECT * WHERE {{
        wd:{pid} wdt:P1630 ?url
        }}
        """
        res = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT,
                                 query)["results"]["bindings"]

        if not res:
            raise ValueError(
                "No formatter URL found. Specify it in the `formatter_url` argument to this function instead."
            )

        formatter_url = res[0]["url"]["value"]

    elif "$1" not in formatter_url:
        raise ValueError(
            "Argument formatter_url must contain $1, describing where the ID appears."
        )

    # get wikidata urls and internal IDs for PID
    query = f"""SELECT * WHERE {{
    ?wiki_url wdt:{pid} ?external_id .
    }}"""

    res = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT,
                             query)["results"]["bindings"]

    if not res:
        # Previously the function fell off the end and returned None here, which
        # contradicts the declared return type; give callers an empty frame instead.
        return pd.DataFrame(columns=["wikidata_url", "external_url"])

    res_df = pd.json_normalize(res)[[
        "wiki_url.value", "external_id.value"
    ]].rename(
        columns={
            "wiki_url.value": "wikidata_url",
            "external_id.value": "external_url",
        })

    # expand each bare external ID into a full URL using the formatter pattern
    res_df["external_url"] = res_df["external_url"].apply(
        lambda i: formatter_url.replace("$1", i))

    return res_df

Esempio n. 2

0

Mostra file

File: pipelines.py Progetto: TheScienceMuseum/heritage-connector

    def _get_unlabelled_records_from_sparql_store(self,
                                                  limit: int = None
                                                  ) -> Iterable[dict]:
        """
        Fetch the URI and label of every record in this table that has no owl:sameAs
        value, querying the endpoint at `config.FUSEKI_ENDPOINT`.

        Args:
            limit (int, optional): if given, appended to the query as a SPARQL LIMIT.
                Defaults to None (no limit).

        Returns:
            Generator of dicts. Each dict has the form {"id": __, "label": ___}
        """

        sparql_query = f"""SELECT DISTINCT ?item ?itemLabel WHERE {{
            FILTER NOT EXISTS {{?item owl:sameAs ?object}}.
            ?item rdfs:label ?itemLabel.
            {self._get_type_constraint()}
            {self.extra_sparql_lines}
            ?item skos:hasTopConcept '{self.table_name}'.
        }}"""

        if limit is not None:
            sparql_query += f"LIMIT {limit}"

        bindings = get_sparql_results(config.FUSEKI_ENDPOINT,
                                      sparql_query)["results"]["bindings"]

        # the query runs eagerly above; only the reshaping of rows is lazy
        return (
            {"id": row["item"]["value"], "label": row["itemLabel"]["value"]}
            for row in bindings
        )

Esempio n. 3

0

Mostra file

File: wikidata_test.py Progetto: TheScienceMuseum/heritage-connector

def test_propertylookup_query():
    """Check the shape of the response to a property-lookup query for two fixed entities."""
    lookup_query = """
    SELECT ?item ?itemLabel ?itemDescription ?altLabel ?P570Label ?P569Label
    WHERE {
        VALUES (?item) { (wd:Q106481) (wd:Q46633) }
        OPTIONAL{ ?item wdt:P570 ?P570 .}
        OPTIONAL{ ?item wdt:P569 ?P569 .}

        OPTIONAL {
        ?item skos:altLabel ?altLabel .
        FILTER (lang(?altLabel) = "en")
        }

        SERVICE wikibase:label { 
        bd:serviceParam wikibase:language "en" .
        }
    }
    """

    response = get_sparql_results(endpoint, lookup_query)

    # two entities are listed in VALUES, so two rows are expected
    assert len(response["results"]["bindings"]) == 2

    # the first row has one binding per variable named in SELECT
    first_row = response["results"]["bindings"][0]
    assert len(first_row) == 6

Esempio n. 4

0

Mostra file

File: wikidata_test.py Progetto: TheScienceMuseum/heritage-connector

def test_shortestpath_query():
    """Check that the two-sided shortest-path (SSSP) query reports a combined length of 2."""
    sssp_query = """PREFIX gas: <http://www.bigdata.com/rdf/gas#>

    SELECT ?super (?aLength + ?bLength as ?length) WHERE {
    SERVICE gas:service {
        gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ;
                    gas:in wd:Q22687 ;
                    gas:traversalDirection "Forward" ;
                    gas:out ?super ;
                    gas:out1 ?aLength ;
                    gas:maxIterations 10 ;
                    gas:linkType wdt:P279 .
    }
    SERVICE gas:service {
        gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ;
                    gas:in wd:Q43229 ;
                    gas:traversalDirection "Forward" ;
                    gas:out ?super ;
                    gas:out1 ?bLength ;
                    gas:maxIterations 10 ;
                    gas:linkType wdt:P279 .
    }  
    } ORDER BY ?length
    LIMIT 1"""

    response = get_sparql_results(endpoint, sssp_query)

    # the length arrives as a string holding a float, hence the double conversion
    shortest_row = response["results"]["bindings"][0]
    path_length = int(float(shortest_row["length"]["value"]))
    assert path_length == 2

Esempio n. 5

0

Mostra file

File: test_utils.py Progetto: TheScienceMuseum/heritage-connector

    def test_get_sparql_results(self):
        """A generic triple query against Wikidata returns the results/bindings envelope."""
        response = sparql.get_sparql_results(
            "https://query.wikidata.org/sparql",
            "SELECT ?s ?p ?o WHERE {?s ?p ?o} LIMIT 20",
            add_prefixes=True,
        )

        assert "results" in response
        results_section = response["results"]
        assert "bindings" in results_section

Esempio n. 6

0

Mostra file

    def lookup_wikidata_id(self, pid: str, uid: str) -> str:
        """
        Find the Wikidata item whose property `pid` has the exact string value `uid`.

        Args:
            pid (str): Property ID of source (e.g. OxDnB ID: P1415)
            uid (str): Value of source ID (e.g. an OxDnB ID: 23105)

        Returns:
            qcode: Wikidata qcode in format Q(d+), taken from the first result row.
                None is returned when the response or its bindings are empty.
        """

        endpoint_url = config.WIKIDATA_SPARQL_ENDPOINT

        sparql_query = f"""
            SELECT ?item ?itemLabel WHERE {{
                ?item wdt:{pid} "{uid}".
                SERVICE wikibase:label {{
                    bd:serviceParam wikibase:language "en" .
                }}
            }}
        """

        response = get_sparql_results(endpoint_url, sparql_query)
        if not response:
            return None

        bindings = response["results"]["bindings"]
        if not bindings or not bindings[0]:
            return None

        # only the first matching item is used; the qcode is pulled out of its entity URL
        first_item_url = bindings[0]["item"]["value"]
        return re.findall(r"(Q\d+)", first_item_url)[0]

Esempio n. 7

0

Mostra file

File: reconciler.py Progetto: TheScienceMuseum/heritage-connector

    def _get_subject_items_from_pid(pid: str) -> list:
        """
        Return the subject items linked to a Wikidata property through 'subject item of
        this property (P1629)'.

        `pid` may also be a URL (anything starting with "http"); in that case a warning
        is logged and the PID is extracted from it. A ValueError is raised unless the
        URL contains exactly one PID-like token.
        """

        if pid.startswith("http"):
            logger.warning(
                "WARNING: URL instead of PID entered. Converting to PID")
            matches = re.findall(r"(P\d+)", pid)

            if len(matches) != 1:
                raise ValueError("URL not a valid property URL.")

            pid = matches[0]

        sparql_query = f"""
        SELECT ?property WHERE {{
        wd:{pid} wdt:P1629 ?property.
        }}
        """

        response = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT,
                                      sparql_query)

        if "results" not in response:
            return []

        return [
            url_to_qid(row["property"]["value"])
            for row in response["results"]["bindings"]
        ]

Esempio n. 8

0

Mostra file

def get_wikidata_equivalents_for_properties(
    properties: List[str], raise_missing=False, warn_missing=True
) -> dict:
    """
    Map RDF property URIs to their Wikidata equivalents via 'equivalent property' (P1628).

    Args:
        properties (List[str]): list of URIs of properties
        raise_missing (bool, optional): If True, raises a ValueError if Wikidata equivalents
            can't be found for any of the specified properties. Defaults to False.
        warn_missing (bool, optional): If True, logs a warning if Wikidata equivalents
            can't be found for any of the specified properties. Defaults to True.

    Returns:
        dict: {property: wikidata_value, ...}. Properties that already start with the WDT
            namespace map to themselves; properties with no equivalent found map to None.
    """

    # URIs already in the WDT namespace need no lookup
    already_wikidata = [uri for uri in properties if uri.startswith(str(WDT))]
    to_look_up = list(set(properties) - set(already_wikidata))

    values_clause = " ".join(["<" + uri + ">" for uri in to_look_up])

    sparql_query = f"""SELECT * WHERE {{
    VALUES ?internal_property {{ {values_clause} }}.
    ?wiki_property wdt:P1628 ?internal_property.
    }}"""

    bindings = get_sparql_results(
        config.WIKIDATA_SPARQL_ENDPOINT, sparql_query
    )["results"]["bindings"]

    mapping = {}
    for row in bindings:
        mapping[row["internal_property"]["value"]] = row["wiki_property"]["value"]
    for uri in already_wikidata:
        mapping[uri] = uri

    unresolved = set(properties) - set(mapping.keys())

    if unresolved:
        if raise_missing:
            raise ValueError(
                f"Values {unresolved} are missing from results. To disable this raising an exception, set input raise_missing to False."
            )

        if warn_missing:
            logger.warning(f"Values {unresolved} are missing from results.")

    # every requested property gets a key, even when nothing was found for it
    mapping.update(dict.fromkeys(unresolved))

    return mapping

Esempio n. 9

0

Mostra file

File: pipelines.py Progetto: TheScienceMuseum/heritage-connector

    def _get_predicates(
        self,
        predicates_ignore: List[str] = [
            RDFS.label,
            OWL.sameAs,
            SKOS.hasTopConcept,
            FOAF.title,
        ],
    ) -> List[str]:
        """
        List the distinct predicates used by subjects of this table. These will form the
        columns of X.

        Args:
            predicates_ignore (List[str]): predicates to leave out of the result

        Returns:
            list of URLs for each predicate, excluding those in `predicates_ignore`
            (empty if the query returns nothing)
        """

        # TODO: remove this when using pydantic as it will coerce rdflib.term.URIRef to string
        ignored = list(map(str, predicates_ignore))

        sparql_query = f"""
        SELECT DISTINCT ?predicate
        WHERE {{
        ?subject <http://www.w3.org/2004/02/skos/core#hasTopConcept> '{self.table_name}'.
        ?subject ?predicate ?object.
        }}"""

        bindings = get_sparql_results(config.FUSEKI_ENDPOINT,
                                      sparql_query)["results"]["bindings"]

        kept = []
        for row in bindings:
            predicate_url = row["predicate"]["value"]
            if predicate_url not in ignored:
                kept.append(predicate_url)

        return kept

Esempio n. 10

0

Mostra file

File: wikidata_test.py Progetto: TheScienceMuseum/heritage-connector

def test_entitysearch_query():
    """Check that an EntitySearch query for "bank", restricted to the Q43229 class tree, returns rows."""
    search_query = """SELECT DISTINCT ?item ?itemLabel 
    WHERE
    {
        ?item wdt:P31/wdt:P279* wd:Q43229.
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:api "EntitySearch" .
            bd:serviceParam wikibase:endpoint "www.wikidata.org" .
            bd:serviceParam mwapi:search "bank" .
            bd:serviceParam mwapi:language "en" .
            ?item wikibase:apiOutputItem mwapi:item .
            ?num wikibase:apiOrdinal true .
            }

        SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en" .
        }
    }"""

    bindings = get_sparql_results(endpoint, search_query)["results"]["bindings"]

    # at least one result returned
    assert len(bindings) > 0

Esempio n. 11

0

Mostra file

def get_internal_urls_from_wikidata(
        url_pattern: str, wikidata_endpoint=config.WIKIDATA_SPARQL_ENDPOINT):
    """
    Fetch all Wikidata records whose 'described at URL' (P973) value matches `url_pattern`.
    Returned internal URLs have "http://" rewritten to "https://".

    Args:
        url_pattern (str): the regex pattern to describe collection URLs. The Science Museum's is 'collection.sciencemuseum.org.uk'.
        wikidata_endpoint (str, optional): SPARQL endpoint for Wikidata.

    Returns:
        pd.DataFrame: columns item (Wikidata URL), itemLabel (label) and internalURL (internal URL).
            When the query returns nothing, the empty frame from `pd.json_normalize` is returned as is.
    """

    sparql_query = f"""
        SELECT DISTINCT ?item ?itemLabel ?internalURL WHERE {{
            ?item wdt:P973 ?internalURL

            filter( regex(str(?internalURL), "{url_pattern}" ) )

            SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "en" .
            }}
        }}     
    """

    bindings = get_sparql_results(wikidata_endpoint,
                                  sparql_query)["results"]["bindings"]
    urls_df = pd.json_normalize(bindings)

    if len(urls_df) == 0:
        return urls_df

    wanted_cols = ["item.value", "itemLabel.value", "internalURL.value"]
    urls_df = urls_df[wanted_cols].rename(
        columns=lambda name: name.replace(".value", ""))

    urls_df["internalURL"] = urls_df["internalURL"].apply(
        lambda url: url.replace("http://", "https://"))

    return urls_df

Esempio n. 12

0

Mostra file

def filter_qids_in_class_tree(
    qids: list,
    higher_class: Union[str, list],
    classes_exclude: Union[str, list] = None,
    include_instanceof: bool = False,
) -> list:
    """
    Keep only the QIDs/PIDs that sit in the 'subclass of' (P279) tree below `higher_class`
    (or below any of them, when a list is given), minus anything below `classes_exclude`.
    Each class QID is passed through `raise_invalid_qid` before the query is built.

    Args:
        qids (list): list of QIDs
        higher_class (Union[str, list]): QID or QIDs of higher class to filter on
        classes_exclude (Union[str, list]): QID or QIDs of higher classes to exclude. Defaults to None.
        include_instanceof (bool, optional): whether to include an initial instance of (P31) step in the class tree.
            Defaults to False.

    Returns:
        list: unique list of filtered QIDs
    """

    values_clause = join_qids_for_sparql_values_clause(qids)

    # assume format of each item of qids has already been checked
    # TODO: what's a good pattern for coordinating this checking so it's not done multiple times?

    tree_path = "wdt:P31/wdt:P279*" if include_instanceof else "wdt:P279*"

    def minus_clause(qid):
        # one MINUS pattern removing everything below the excluded class
        return f"""MINUS {{?item wdt:P279* wd:{qid}. hint:Prior hint:gearing "forward".}}."""

    if not classes_exclude:
        exclude_slug = ""

    elif isinstance(classes_exclude, str):
        raise_invalid_qid(classes_exclude)
        exclude_slug = minus_clause(classes_exclude)

    elif isinstance(classes_exclude, list):
        for excluded in classes_exclude:
            raise_invalid_qid(excluded)
        exclude_slug = "\n".join(
            [minus_clause(excluded) for excluded in classes_exclude]
        )

    else:
        errors.raise_must_be_str_or_list("classes_exclude")

    if isinstance(higher_class, str):
        raise_invalid_qid(higher_class)

        sparql_query = f"""SELECT DISTINCT ?item WHERE {{
        VALUES ?item {{ {values_clause} }}
        ?item {tree_path} wd:{higher_class}.
        hint:Prior hint:gearing "forward".
        {exclude_slug}
        }}"""

    elif isinstance(higher_class, list):
        for parent in higher_class:
            raise_invalid_qid(parent)
        classes_str = ", ".join(["wd:" + parent for parent in higher_class])

        sparql_query = f"""SELECT DISTINCT ?item WHERE {{
        VALUES ?item {{ {values_clause} }}
        ?item {tree_path} ?tree.
        hint:Prior hint:gearing "forward".
        FILTER (?tree in ({classes_str}))
        {exclude_slug}
        }}"""

    else:
        errors.raise_must_be_str_or_list("higher_class")

    response = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, sparql_query)

    # strip the entity namespace so bare QIDs are returned
    entity_prefix = "http://www.wikidata.org/entity/"
    return [
        row["item"]["value"].replace(entity_prefix, "")
        for row in response["results"]["bindings"]
    ]

Esempio n. 13

0

Mostra file

def get_distance_between_entities(
    qcode_set: Set[str],
    bidirectional: bool = False,
    vertex_pid: str = "P279",
    reciprocal: bool = False,
    max_path_length: int = 10,
) -> float:
    """
    Get the length of the shortest path between the two entities in `qcode_set`, following
    edges of type `vertex_pid` ('subclass of' by default).
    Flag `reciprocal=True` returns 1/(1+l) where l is the length of the shortest path, which can be treated as a similarity measure.

    Args:
        qcode_set (Set[str]): one or two QIDs. A set of length 1 is treated as the same
            entity passed twice. An empty string stands for a missing entity.
        bidirectional (bool, optional): If True, the two entities only need a common
            ancestor: the distance is the smallest sum of the forward path lengths from each
            entity to an entity reachable from both. If False, one of the two entities must be
            reachable from the other by following `vertex_pid` forwards. Defaults to False.
        vertex_pid (str, optional): PID of the edge type to traverse. Defaults to "P279".
        reciprocal (bool, optional): Return 1/(1+l), where l is the length of the shortest path. Defaults to False.
        max_path_length (int, optional): Value passed as `gas:maxIterations` to the
            shortest-path search. If no path is found, 10*max_path_length (reciprocal=False)
            or 1/(1+10*max_path_length) (reciprocal=True) is returned. Defaults to 10.

    Returns:
        float: distance (the path length found, or 10*max_path_length if none) or
            reciprocal distance (0 <= d <= 1)

    Raises:
        ValueError: if `qcode_set` does not contain exactly 1 or 2 items.
    """

    if len(qcode_set) == 1:
        # identity - assume two values have been passed in even though the set will have length 1
        return 1 if reciprocal else 0

    if len(qcode_set) != 2:
        raise ValueError("Input variable qcode_set must contain exactly 1 or 2 items")

    qcodes = list(qcode_set)

    if "" in qcodes:
        # at least one value is empty: reciprocal similarity is 0, plain distance is 1
        # (values kept as they were for backward compatibility)
        return 0 if reciprocal else 1

    raise_invalid_qid(qcodes[0])
    raise_invalid_qid(qcodes[1])

    def sssp_service(qcode: str, length_var: str) -> str:
        # One forward shortest-path traversal starting at `qcode` along `vertex_pid` edges.
        # Both traversals use the same edge type; the first one used to be hard-coded to
        # P279 in the non-bidirectional query, which ignored `vertex_pid`.
        return f"""SERVICE gas:service {{
            gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ;
                        gas:in wd:{qcode} ;
                        gas:traversalDirection "Forward" ;
                        gas:out ?super ;
                        gas:out1 ?{length_var} ;
                        gas:maxIterations {max_path_length} ;
                        gas:linkType wdt:{vertex_pid} .
        }}"""

    service_a = sssp_service(qcodes[0], "aLength")
    service_b = sssp_service(qcodes[1], "bLength")

    if bidirectional:
        query = f"""PREFIX gas: <http://www.bigdata.com/rdf/gas#>

        SELECT ?super (?aLength + ?bLength as ?length) WHERE {{
        {service_a}
        {service_b}
        }} ORDER BY ?length
        LIMIT 1
        """
    else:
        # NOTE: two distances are returned in this query to account for the fact that we don't know whether
        # qcodes[0] or qcodes[1] is higher in the hierarchy, and setting gas:traversalDirection "Undirected"
        # gives a WDQS error. One of these distances is zero as it's the distance between an entity and itself,
        # so the max of the two is returned by this function.

        query = f"""
        PREFIX gas: <http://www.bigdata.com/rdf/gas#>

        SELECT ?aLength ?bLength WHERE {{
        {service_a}

        {service_b}
        FILTER (?super in (wd:{qcodes[0]}, wd:{qcodes[1]})).
        }}
        """

    result = get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, query)["results"][
        "bindings"
    ]

    if not result:
        # no path found within the search limit: report a distance well beyond it
        distance = 10 * max_path_length
    elif bidirectional:
        distance = int(float(result[0]["length"]["value"]))
    else:
        distance = int(
            max(
                float(result[0]["aLength"]["value"]),
                float(result[0]["bLength"]["value"]),
            )
        )

    return 1 / (1 + distance) if reciprocal else distance

Esempio n. 14

0

Mostra file

    def _run_wikidata_query(self, qcodes: list, instanceof_filter: bool,
                            **kwargs) -> pd.DataFrame:
        """
        Runs a parametrised Wikidata query with options to filter by instance or subclass of a specific property.
        Returns a dataframe with qcodes matching the filter, their labels, aliases and
        birth/death/inception/dissolved years. The query text is stored on `self.query`
        and the returned dataframe on `self.sparql_res`.

        Args:
            qcodes (list): a list of qcodes before filtering
            instanceof_filter (bool): whether to filter results by instance or subclass of a certain property

        Kwargs:
            property_id (str): the property to use as a parameter for the 'instance of' filter
            include_class_tree (bool): whether to include all subclasses in the search up the tree, or just the instanceof property

        Returns:
            pd.DataFrame: columns are qcode, label, alias, birthYear, deathYear,
                inceptionYear, dissolvedYear. Empty (with these columns) if the query
                returns no rows.

        Raises:
            ValueError: if `instanceof_filter` is True and either `property_id` or
                `include_class_tree` is not passed.
        """

        if instanceof_filter:
            # process kwargs related to the 'instance of' filter
            if "property_id" not in kwargs:
                raise ValueError(
                    "Keyword argument property_id (str) must be passed if using instance_of_filter."
                )

            if "include_class_tree" not in kwargs:
                raise ValueError(
                    "Keyword argument include_class_tree (bool) must be passed if using instance_of_filter."
                )

            property_id = kwargs["property_id"]

            # create line of SPARQL query that does filtering
            class_tree = "/wdt:P279*" if kwargs["include_class_tree"] else ""
            sparq_instanceof = f"?item wdt:P31{class_tree} wd:{property_id}."

        else:
            sparq_instanceof = ""

        values_rows = " ".join([f"(wd:{i})" for i in qcodes])

        query = f"""
        SELECT ?item ?itemLabel ?altLabel ?birthYear ?deathYear ?inceptionYear ?dissolvedYear
                WHERE
                {{
                    VALUES (?item) {{ {values_rows} }}
                    {sparq_instanceof}
                    OPTIONAL{{
                        ?item wdt:P569 ?birthDate.
                        BIND( year(?birthDate) AS ?birthYear )
                        }}
                    OPTIONAL {{
                        ?item wdt:P570 ?deathDate.
                        BIND( year(?deathDate) AS ?deathYear )
                        }}
                    OPTIONAL {{
                        ?item wdt:P571 ?inceptionDate.
                        BIND( year(?inceptionDate) AS ?inceptionYear )
                        }}
                    OPTIONAL {{
                        ?item wdt:P576 ?dissolvedDate.
                        BIND( year(?dissolvedDate) AS ?dissolvedYear )
                        }}
                    OPTIONAL {{
                        ?item skos:altLabel ?altLabel .
                        FILTER (lang(?altLabel) = "en")
                        }}

                    SERVICE wikibase:label {{
                    bd:serviceParam wikibase:language "en" .
                    }}
                }}
        """
        self.query = query
        res = get_sparql_results(self.sparql_endpoint_url,
                                 query)["results"]["bindings"]

        if not res:
            # json_normalize([]) has no columns, so the "item.value" lookup below would
            # raise KeyError; return an empty frame with the documented columns instead.
            empty_df = pd.DataFrame(columns=[
                "qcode",
                "label",
                "alias",
                "birthYear",
                "deathYear",
                "inceptionYear",
                "dissolvedYear",
            ])
            self.sparql_res = empty_df
            return empty_df

        res_df = pd.json_normalize(res)
        res_df["qcode"] = res_df["item.value"].apply(
            lambda x: re.findall(r"(Q\d+)", x)[0])

        # fill missing columns with blanks for any columns that aren't in the data
        final_cols = [
            "qcode",
            "itemLabel.value",
            "altLabel.value",
            "birthYear.value",
            "deathYear.value",
            "inceptionYear.value",
            "dissolvedYear.value",
        ]
        for col in final_cols:
            if col not in res_df.columns:
                res_df[col] = ""

        # copy so the column assignment below works on an independent frame
        res_df = res_df[final_cols].copy()

        # convert aliases to lowercase and fill nan with empty string
        res_df["altLabel.value"] = (
            res_df["altLabel.value"].fillna("").astype(str).str.lower())

        res_df = res_df.drop_duplicates()

        # rename columns (remove .value suffix from year columns)
        res_df = res_df.rename(columns={
            "itemLabel.value": "label",
            "altLabel.value": "alias"
        })
        res_df = res_df.rename(columns=lambda x: x.replace(".value", ""))
        self.sparql_res = res_df
        return res_df

Esempio n. 15

0

Mostra file

File: search.py Progetto: TheScienceMuseum/heritage-connector

    def run_search(self, text: str, limit=100, **kwargs) -> pd.DataFrame:
        """
        Run Wikidata search (MWAPI "EntitySearch") through the Wikidata SPARQL endpoint.

        Args:
            text (str): text to search
            limit (int, optional): Defaults to 100.

        Kwargs:
            instanceof_filter (str/list): property or properties to filter values by instance of.
            include_class_tree (bool): whether to look in the subclass tree for the instance of filter.
                Only a truthy value enables it.
            property_filters (dict): filters on exact values of properties you want to pass through. {property: value, ...}
                Each value is a QID or a list of QIDs; a list means "any of these".

        Returns:
            pd.DataFrame: columns rank, item, itemLabel, plus whatever
                `add_score_to_search_results_df` adds (presumably score). An empty
                dataframe is returned when there are no results.

        Raises:
            TypeError: if `instanceof_filter` is neither a str nor a list.
        """

        # Test the flag's value, not its presence: include_class_tree=False must not
        # switch the subclass tree on.
        class_tree = "/wdt:P279*" if kwargs.get("include_class_tree") else ""

        sparq_instanceof = ""
        if "instanceof_filter" in kwargs:
            property_id = kwargs["instanceof_filter"]
            if isinstance(property_id, str):
                property_id = [property_id]
            elif not isinstance(property_id, list):
                raise TypeError(
                    "Keyword argument instanceof_filter must be a str or a list of str."
                )

            if len(property_id) == 1:
                # one instanceof in filter
                sparq_instanceof = f"?item wdt:P31{class_tree} wd:{property_id[0]}."
            else:
                ids = ", ".join(["wd:" + x for x in property_id])
                sparq_instanceof = f" ?item wdt:P31{class_tree} ?tree. \n FILTER (?tree in ({ids}))"

        sparq_property_filter = ""
        for prop, value in (kwargs.get("property_filters") or {}).items():
            if isinstance(value, str):
                values = [value]
            elif isinstance(value, list):
                values = value
            else:
                # values of any other type are ignored, as before
                continue

            for qid in values:
                assert_qid_format(qid)

            if len(values) == 1:
                sparq_property_filter += f"\n ?item wdt:{prop} wd:{values[0]} ."
            else:
                # Append to the property filters (this used to overwrite the instance-of
                # filter) and use a per-property variable so that several multi-valued
                # filters, and the ?tree variable above, don't collide.
                ids = ", ".join(["wd:" + x for x in values])
                sparq_property_filter += (
                    f"\n ?item wdt:{prop} ?{prop}_value . \n FILTER (?{prop}_value in ({ids}))"
                )

        # escape backslashes and double quotes so the text can't break out of the literal
        escaped_text = text.replace("\\", "\\\\").replace('"', '\\"')

        endpoint_url = config.WIKIDATA_SPARQL_ENDPOINT
        query = f"""
        SELECT DISTINCT ?item ?itemLabel
        WHERE
        {{
            {sparq_instanceof}
            {sparq_property_filter}
            SERVICE wikibase:mwapi {{
                bd:serviceParam wikibase:api "EntitySearch" .
                bd:serviceParam wikibase:endpoint "www.wikidata.org" .
                bd:serviceParam mwapi:search "{escaped_text}" .
                bd:serviceParam mwapi:language "en" .
                ?item wikibase:apiOutputItem mwapi:item .
                ?num wikibase:apiOrdinal true .
              }}

            SERVICE wikibase:label {{
            bd:serviceParam wikibase:language "en" .
            }}
        }} LIMIT {limit}
        """

        res = get_sparql_results(endpoint_url, query)["results"]["bindings"]

        res_df = pd.json_normalize(res)

        if len(res_df) > 0:
            res_df = res_df[[
                "item.value", "itemLabel.value"
            ]].rename(columns=lambda x: x.replace(".value", ""))

            # rank is the 1-based position of each row in the response
            res_df = res_df.reset_index().rename(columns={"index": "rank"})
            res_df["rank"] = res_df["rank"] + 1

            res_df = self.add_score_to_search_results_df(res_df,
                                                         rank_col="rank")

        return res_df

Esempio n. 16

0

Mostra file

File: search.py Progetto: TheScienceMuseum/heritage-connector

    def run_search(self, text: str, limit=100, similarity_thresh=50, **kwargs):
        """
        Run Wikipedia search (MWAPI "Generator" search on en.wikipedia.org) through the
        Wikidata SPARQL endpoint, then rank and filter results by string similarity.

        Args:
            text (str): text to search
            limit (int, optional): Defaults to 100.
            similarity_thresh (int, optional): The text similarity cut-off to exclude items from search results. Defaults to 50.

        Kwargs:
            instanceof_filter (str/list): property or properties to filter values by instance of.
            include_class_tree (bool): whether to look in the subclass tree for the instance of filter.
                Only a truthy value enables it.
            property_filters (dict): filters on exact values of properties you want to pass through. {property: value, ...}
                Each value is a QID or a list of QIDs; a list means "any of these".

        Returns:
            pd.DataFrame: columns rank, item, wikipedia_title, plus whatever
                `add_score_to_search_results_df` adds (presumably score). An empty
                dataframe is returned when the query yields no results.

        Raises:
            TypeError: if `instanceof_filter` is neither a str nor a list.
        """

        # Test the flag's value, not its presence: include_class_tree=False must not
        # switch the subclass tree on.
        class_tree = "/wdt:P279*" if kwargs.get("include_class_tree") else ""

        sparq_instanceof = ""
        if "instanceof_filter" in kwargs:
            property_id = kwargs["instanceof_filter"]
            if isinstance(property_id, str):
                property_id = [property_id]
            elif not isinstance(property_id, list):
                raise TypeError(
                    "Keyword argument instanceof_filter must be a str or a list of str."
                )

            if len(property_id) == 1:
                # one instanceof in filter
                sparq_instanceof = f"?item wdt:P31{class_tree} wd:{property_id[0]}."
            else:
                ids = ", ".join(["wd:" + x for x in property_id])
                sparq_instanceof = f" ?item wdt:P31{class_tree} ?tree. \n FILTER (?tree in ({ids}))"

        sparq_property_filter = ""
        for prop, value in (kwargs.get("property_filters") or {}).items():
            if isinstance(value, str):
                values = [value]
            elif isinstance(value, list):
                values = value
            else:
                # values of any other type are ignored, as before
                continue

            for qid in values:
                assert_qid_format(qid)

            if len(values) == 1:
                sparq_property_filter += f"\n ?item wdt:{prop} wd:{values[0]} ."
            else:
                # Append to the property filters (this used to overwrite the instance-of
                # filter) and use a per-property variable so that several multi-valued
                # filters, and the ?tree variable above, don't collide.
                ids = ", ".join(["wd:" + x for x in values])
                sparq_property_filter += (
                    f"\n ?item wdt:{prop} ?{prop}_value . \n FILTER (?{prop}_value in ({ids}))"
                )

        # escape backslashes and double quotes so the text can't break out of the literal
        escaped_text = text.replace("\\", "\\\\").replace('"', '\\"')

        endpoint_url = config.WIKIDATA_SPARQL_ENDPOINT
        query = f"""
        SELECT ?item ?wikipedia_title {{
            SERVICE wikibase:mwapi {{
                bd:serviceParam wikibase:endpoint "en.wikipedia.org" .
                bd:serviceParam wikibase:api "Generator" .
                bd:serviceParam mwapi:generator "search" .
                bd:serviceParam mwapi:gsrsearch "{escaped_text}" .
                bd:serviceParam mwapi:gsrlimit "max" .
                ?item wikibase:apiOutputItem mwapi:item .
                ?wikipedia_title wikibase:apiOutput mwapi:title .
            }}
            hint:Prior hint:runFirst "true".
            {sparq_instanceof}
            {sparq_property_filter}
        }} LIMIT {limit}
        """

        res = get_sparql_results(endpoint_url, query)["results"]["bindings"]

        res_df = pd.json_normalize(res)

        if len(res_df) > 0:
            res_df = res_df[[
                "item.value", "wikipedia_title.value"
            ]].rename(columns=lambda x: x.replace(".value", ""))

            # score each title against the search text, drop those under the threshold,
            # and order the rest from most to least similar
            res_df["text_similarity"] = res_df["wikipedia_title"].apply(
                lambda s: self.calculate_label_similarity(text, s))
            res_df = (res_df[
                res_df["text_similarity"] >= similarity_thresh].sort_values(
                    "text_similarity", ascending=False).reset_index(drop=True))
            res_df = res_df.drop(columns="text_similarity")

            # rank is the 1-based position after sorting by similarity
            res_df = res_df.reset_index().rename(columns={"index": "rank"})
            res_df["rank"] = res_df["rank"] + 1

            res_df = self.add_score_to_search_results_df(res_df,
                                                         rank_col="rank")

        return res_df