def get_label_uri(uri):
    """Return the English label of a resource.

    ``rdfs:label`` is queried first; ``foaf:name`` is queried only when no
    English ``rdfs:label`` came back.

    Arguments:
        uri {str} -- resource reference, spliced verbatim into the query

    Returns:
        str -- value of the first binding returned by the matching property

    Raises:
        ValueError -- neither property yielded an English value
    """
    for predicate in ('rdfs:label', 'foaf:name'):
        bindings = query_sparql(
            'select ?l where {' + uri + ' ' + predicate +
            ' ?l . FILTER(langMatches(lang(?l),"en")) }')
        if len(bindings) > 0:
            return bindings[0]['l']['value']
    raise ValueError("cannot find uri label")
def identify_city_uri_attribute(uri, attribute):
    """Retrieve the uris of the cities reachable from an attribute of a dbpedia page.

       Everything that is not a country is treated as a city. The query:
            1) starts from the values of the attribute on the page
            2) follows redirect and disambiguation links, recursively
            3) drops countries, US states, and pages that themselves
               redirect or disambiguate

    Arguments:
        uri {str} -- page reference, spliced verbatim into the queries
        attribute {str} -- property whose values are inspected

    Returns:
        list -- uris (wrapped in angle brackets) of the cities found.
                None is returned when the page has no value for the
                attribute, or when no city could be associated with it
                (the latter case is logged as a warning).
    """
    if not ask_sparql("ask { " + uri + " " + attribute + "  ?s }"):
        return None

    query = "select distinct ?f where{\
                    " + uri + " " + attribute + "  ?s .\
                    ?s (dbo:wikiPageRedirects|dbo:wikiPageDisambiguates)*  ?f .\
                    filter not exists { ?f a dbo:Country } .\
                    filter not exists { ?f dct:subject dbc:States_of_the_United_States } .\
                    filter not exists { ?f dbo:wikiPageDisambiguates ?w } .\
                    filter not exists { ?f dbo:wikiPageRedirects ?w } .\
                }"

    cities = []
    for row in query_sparql(query):
        cities.append('<' + row['f']['value'] + '>')
    if cities:
        return cities

    logging.getLogger('root.features').warning(
        f"{uri} had a valid {attribute} value, but I was not able to associate a city to it."
    )
    return None
# Example #3
# 0
def get_disambiguating_uri(uri):
    """Return the uris a disambiguation page points to.

    Arguments:
        uri {str} -- page reference, spliced verbatim into the query

    Returns:
        list -- every ``dbo:wikiPageDisambiguates`` target, wrapped in angle
                brackets; empty when the query returns no rows
    """
    # The space before the predicate keeps it a separate token even when
    # ``uri`` is a prefixed name rather than a bracketed IRI.
    q = "SELECT ?s WHERE{" + uri + " dbo:wikiPageDisambiguates ?s . }"
    results = query_sparql(q)
    return [f"<{r['s']['value']}>" for r in results]
# Example #4
# 0
def get_redirecting_uri(uri):
    """Resolve a redirect: return the target uri, or the input if there is none.

    Arguments:
        uri {str} -- page reference, spliced verbatim into the query

    Returns:
        str -- the ``dbo:wikiPageRedirects`` target wrapped in angle brackets,
               or ``uri`` unchanged when the query returns no rows

    Raises:
        AssertionError -- the query returned more than one redirect target
    """
    q = "SELECT ?s WHERE{" + uri + " dbo:wikiPageRedirects ?s . }"
    results = query_sparql(q)
    if len(results) == 0:
        return uri
    if len(results) != 1:
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError(
            f"The uri {uri} has more than one redirecting page, behaviour not expected!"
        )
    return f"<{results[0]['s']['value']}>"
def identify_country_uri_attribute(uri, attribute):
    """Retrieve the dbpedia pages of the countries associated with a dbpedia page and
       an attribute contained in that page.

       The attribute can represent a general location. Starting from its value, the
       query walks redirect/disambiguation links and the country / isPartOf / state /
       region / archipelago relations, then each candidate is checked for being a
       dbo:Country or a US state. eg: United Kingdom, Italy, Ireland ..

    Arguments:
        uri {str} -- page reference, spliced verbatim into the queries
        attribute {str} -- property whose values are inspected

    Returns:
        list -- uris (wrapped in angle brackets) of the matching pages.
                None is returned when the query yields no rows, or when no
                candidate passes the check (the latter case is logged as a
                warning).
    """

    # The dissolutionYear and Imperialism filters exclude countries of past times
    # that would otherwise be caught every now and then
    query = "select distinct ?f where{\
                " + uri + " " + attribute + "  ?s .\
                ?s (dbo:wikiPageRedirects|dbo:wikiPageDisambiguates)* ?r .\
                ?r (dbo:country|dbo:isPartOf|dbo:state|dbo:region|dbo:archipelago)* ?u .\
                ?u (dbo:wikiPageRedirects|dbo:wikiPageDisambiguates)* ?f .\
                filter not exists { ?f dbo:dissolutionYear ?w } .\
                filter not exists { ?f dct:subject dbc:Imperialism } .\
                filter not exists { ?f dbo:wikiPageDisambiguates ?w } .\
                filter not exists { ?f dbo:wikiPageRedirects ?w } .\
            }"
    rows = query_sparql(query)
    if len(rows) == 0:
        return None

    # Only bindings of type 'uri' are kept as candidates
    candidates = [
        f"<{row['f']['value']}>" for row in rows if row['f']['type'] == 'uri'
    ]
    countries = []
    for candidate in candidates:
        # USA not an interesting country, too common
        if candidate == '<http://dbpedia.org/resource/United_States>':
            continue
        if candidate in countries:
            continue
        check = "ask{\
                        {" + candidate + " a dbo:Country}\
                        UNION\
                        {" + candidate + " dct:subject dbc:States_of_the_United_States}\
                    }"
        if ask_sparql(check):
            countries.append(candidate)

    if countries:
        return countries

    logging.getLogger('root.features').warning(
        f"{uri} had a valid {attribute} value, but I was not able to associate a country to it.")
    return None
def identify_date_attribute(uri, attribute):
    """Extract a date from the value of an attribute of a dbpedia page.

    Only the first row returned by the query is considered.

    Arguments:
        uri {str} -- page reference, spliced verbatim into the query
        attribute {str} -- property holding the date

    Returns:
        the result of ``pd.to_datetime`` on the first value; None when the
        query returns no rows or the conversion fails (the failure is logged
        as a warning)
    """
    query = 'SELECT DISTINCT ?s WHERE{' + uri + ' ' + attribute + ' ?s . }'
    rows = query_sparql(query)
    if len(rows) == 0:
        return None

    raw_value = rows[0]['s']['value']
    try:
        return pd.to_datetime(raw_value)
    except Exception:
        logging.getLogger('root.features').warning(
            f"{uri} had a valid {attribute} value, but I was not able to extract a date from it."
        )
        return None
def get_abstract_uri(uri):
    """Return the English ``dbo:abstract`` of a resource.

    Arguments:
        uri {str} -- resource reference, spliced verbatim into the query

    Returns:
        str -- value of the first binding returned by the query

    Raises:
        IndexError -- the query returned no English abstract
    """
    results = query_sparql(
        'select ?a where {' + uri +
        ' dbo:abstract ?a . FILTER(langMatches(lang(?a),"en")) }')
    if len(results) == 0:
        # Same exception type the bare ``[0]`` lookup used to raise, but with
        # a message that names the offending uri.
        raise IndexError(f"cannot find an English abstract for {uri}")
    return results[0]['a']['value']
def search_label_space(label,
                       narrowing_space_query='',
                       selecting_results_query=''):
    """Search dbpedia for the uris whose English label matches ``label``.

       The label is preprocessed and split on spaces; every token must be
       contained both in the label of the starting page (?s) and in the label
       of the page reached through redirect/disambiguation links (?f).

       TODO: Beyoncé could not be found as a label string, even though this is the label of the singer's actual page.
             Substrings of it (Beyon or Beyo) cannot be found either, as if the page were not visible from sparql.
             The same happens with the pages of Aminé, José González, Jack Ü, Zhané, Björk. However, the dbpedia page is accessible.
             Apparently there are problems with accented characters.

       The uris returned are checked to be neither disambiguation nor redirection pages.

    Arguments:
        label {string} -- text to look for
        narrowing_space_query {string} -- Query fragment constraining variable ?s. With the
                                          default empty fragment, ?s ranges over every
                                          labelled resource
        selecting_results_query {string} -- Query fragment posing conditions on variable ?f,
                                            selecting the results returned by this method

    Returns:
        list -- uris found, wrapped in angle brackets; empty when the
                preprocessed label is empty
    """
    preprocessed_label = _preprocess_label(label)
    if len(preprocessed_label) == 0:
        return []

    # Every token is single-quoted and the tokens are joined with "and";
    # the same expression constrains both ?label and ?l.
    tokens = preprocessed_label.split(' ')
    contains_expr = " and ".join(f"'{token}'" for token in tokens)
    contains_clause = f"\"{contains_expr}\" . "

    query_parts = [
        "SELECT DISTINCT ?f { ",
        narrowing_space_query,
        " ?s rdfs:label ?label . FILTER(lang(?label)=\"en\") . ?label bif:contains ",
        contains_clause,
        " ?s (dbo:wikiPageRedirects | dbo:wikiPageDisambiguates)* ?f . ",
        " ?f rdfs:label ?l . FILTER(lang(?l)=\"en\") . ?l bif:contains ",
        contains_clause,
        selecting_results_query,
        "filter not exists { \
              ?f dbo:wikiPageRedirects|dbo:wikiPageDisambiguates ?dis \
              }",
        " }",
    ]
    rows = query_sparql("".join(query_parts))
    uris_found = [f"<{row['f']['value']}>" for row in rows]

    # check, the uris found should not redirect or disambiguate
    for found in uris_found:
        is_indirect = ask_sparql(
            'ask { {' + found + ' dbo:wikiPageRedirects ?w } UNION {' + found +
            ' dbo:wikiPageDisambiguates ?w } }'
        )
        assert not is_indirect, "The result is not expect to be neither a disambiguation nor a redirection page, something is incoherent"

    return uris_found