Esempio n. 1
0
    def query(self, msg, queryString: str, limit=None) -> list:
        '''
        run the given SPARQL query against my endpoint and post-process the records

        String values starting with "http://www.wikidata.org/" are replaced by the
        result of getWikidataId and string columns whose name ends with "coord"
        (case-insensitive) are replaced by separate "lat" and "lon" entries.

        Args:
            msg(str): the profile message to display
            queryString(str): the query to execute
            limit(int): if not None a LIMIT clause with this value is appended to the query

        Return:
            list: the list of dicts with the result
        '''
        profile = Profiler(msg, profile=self.profile)
        sparql = SPARQL(self.endpoint)
        effectiveQuery = queryString if limit is None else f"{queryString} LIMIT {limit}"
        records = sparql.asListOfDicts(sparql.query(effectiveQuery))
        for row in records:
            # work on a snapshot of the column names since the row is modified below
            for column in tuple(row):
                cell = row[column]
                if not isinstance(cell, str):
                    continue
                if cell.startswith("http://www.wikidata.org/"):
                    row[column] = self.getWikidataId(cell)
                if column.lower().endswith("coord"):
                    # the original cell text (not the converted id) is parsed here
                    row["lat"], row["lon"] = Wikidata.getCoordinateComponents(cell)
                    del row[column]

        profile.time(f"({len(records)})")
        return records
Esempio n. 2
0
    def getCities(self, region=None, country=None):
        '''
        get the cities from Wikidata
        '''
        if region is not None:
            values = "VALUES ?region { wd:%s }" % region
        if country is not None:
            values = "VALUES ?country { wd:%s}" % country
        queryString = """# get a list of cities for the given region
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT DISTINCT ?city ?cityLabel ?geoNameId ?cityPop ?cityCoord ?region ?regionLabel ?regionIsoCode ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGdpPerCapita
WHERE {  
  # administrative unit of first order
  # example DE-NW Q1198
  %s
  #?region wdt:P31/wdt:P279* wd:Q10864048.
  ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
  # isocode state/province
  OPTIONAL { ?region wdt:P300 ?regionIsoCode. }
  # country this region belongs to
  ?region wdt:P17 ?country .
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  OPTIONAL {
     ?country wdt:P2132 ?countryGdpPerCapita.
  }
  # located in administrative territory
  # https://www.wikidata.org/wiki/Property:P131
  ?city wdt:P131* ?region.
  # label of the City
  ?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
  # instance of human settlement https://www.wikidata.org/wiki/Q486972
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  # geoName Identifier
  ?city wdt:P1566 ?geoNameId.
  # population of city
  OPTIONAL { ?city wdt:P1082 ?cityPop.}
   # get the coordinates
  OPTIONAL { 
    select (max(?coord) as ?cityCoord) where {
      ?city wdt:P625 ?coord.
    }
  } 
} 
ORDER BY ?cityLabel""" % values
        wd = SPARQL(self.endpoint)
        results = wd.query(queryString)
        cityList = wd.asListOfDicts(results)
        return cityList
    def fromWikiData(self,endpoint):
        '''
        get the country List from WikiData

        Args:
            endpoint(string): the url of the endpoint to be used
         Returns:
            list: and sets it as self.countryList as a side effect
        '''
        wd=SPARQL(endpoint)
        queryString="""
# get a list countries with the corresponding ISO code
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
SELECT ?country ?countryLabel ?shortName (MAX(?pop) as ?population) ?gdpPerCapita ?coord ?isocode
WHERE
{
  # instance of country
  ?country wdt:P31 wd:Q3624078.
  OPTIONAL {
     ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
   }
  OPTIONAL {
      ?country p:P1813 ?shortNameStmt. # get the short name statement
      ?shortNameStmt ps:P1813 ?shortName # the the short name value from the statement
      filter (lang(?shortName) = "en") # filter for English short names only
      filter not exists {?shortNameStmt pq:P31 wd:Q28840786} # ignore flags (aka emojis)
  }
  OPTIONAL {
    # get the population
     # https://www.wikidata.org/wiki/Property:P1082
     ?country wdt:P1082 ?pop.
  }
 OPTIONAL {
     # get the gross domestic product per capita
     ?country wdt:P2132 ?gdpPerCapita.
  }
  # get the iso countryCode
  { ?country wdt:P297 ?isocode }.
  # get the coordinate
  OPTIONAL { ?country wdt:P625 ?coord }.
}
GROUP BY ?country ?countryLabel ?shortName ?population ?gdpPerCapita ?coord ?isocode
ORDER BY ?countryLabel"""
        results=wd.query(queryString)
        self.countryList=wd.asListOfDicts(results)
        for country in self.countryList:
            country['wikidataurl']=country.pop('country')
            country['name']=country.pop('countryLabel')
            super().setNone(country,['shortName','gdpPerCapita'])
        return self.countryList
Esempio n. 4
0
 def test_query_with_authentication(self):
     """check that a protected endpoint rejects anonymous queries and accepts authenticated ones"""
     protectedEndpoint = SPARQL("http://localhost:5820/dblp/query", method="POST")
     volumeQuery = """SELECT * WHERE { ?proceeding dblp:publishedInSeriesVolume "2816" .}"""
     # without credentials the query is expected to be refused
     with self.assertRaises(SPARQLExceptions.Unauthorized):
         protectedEndpoint.queryAsListOfDicts(queryString=volumeQuery)
     # with credentials the same query is expected to deliver two records
     protectedEndpoint.addAuthentication("admin", "admin")
     records = protectedEndpoint.queryAsListOfDicts(volumeQuery)
     self.assertEqual(2, len(records))
Esempio n. 5
0
    def fromRDF(self, endpoint):
        '''
        retrieve my event list from the given SPARQL endpoint
        '''
        # get SPARQL access to GND data
        print(
            "Retrieving %s events from SPARQL endpoint %s\n  ... this might take a few minutes ..."
            % (self.em.title, endpoint))
        starttime = time.time()
        gndEp = SPARQL(endpoint)
        queryString = """# get events with most often used columns from GND
# plus acronym, topic, homepage (seldom but useful)
# WF 2020-07-12
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT  ?event ?eventId ?acronym  ?variant ?title ?date ?areaCode ?place ?topic ?homepage 
WHERE {
  ?event a gnd:ConferenceOrEvent.
  ?event gnd:gndIdentifier ?eventId.
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant.}
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?title.}
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date. }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode. }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place. }
  OPTIONAL { ?event gnd:topic ?topic. }
  { ?event gnd:homepage ?homepage. }
}
#LIMIT 10000"""
        results = gndEp.query(queryString)
        eventList = gndEp.asListOfDicts(results)
        print("retrieved %d events in %6.1f s" %
              (len(eventList), time.time() - starttime))
        for rawevent in eventList:
            rawevent['url'] = rawevent.pop('event')
            fields = [
                'eventId', 'variant', 'name', 'areaCode', 'url', 'source',
                'date', 'startDate', 'endDate', 'year', 'place', 'acronym',
                'lookupAcronym', 'topic', 'homepage'
            ]
            self.em.setNone(rawevent, fields)
            dateStr = rawevent['date']
            for key, value in GND.getDateRange(dateStr).items():
                rawevent[key] = value
            event = Event()
            event.fromDict(rawevent)
            event.source = self.em.name
            self.em.add(event)
        self.em.store(sampleRecordCount=10000)
Esempio n. 6
0
    def getCityPopulations(self, profile=True):
        '''
        get the city populations from Wikidata
        
        Args:
            profile(bool): if True show profiling information
        '''
        queryString = """
# get a list of human settlements having a geoName identifier
# to add to geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15        
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT ?city ?cityLabel ?cityPop ?geoNameId ?country ?countryLabel ?countryIsoCode ?countryPopulation
WHERE {
  # geoName Identifier
  ?city wdt:P1566 ?geoNameId.
  # instance of human settlement https://www.wikidata.org/wiki/Q486972
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  # population of city
  OPTIONAL { ?city wdt:P1082 ?cityPop.}

  # label of the City
  ?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
  # country this city belongs to
  ?city wdt:P17 ?country .
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  OPTIONAL {
     ?country wdt:P2132 ?countryGdpPerCapita.
  }
}"""
        if profile:
            print(
                "getting cities with population and geoNamesId from wikidata endpoint %s"
                % self.endpoint)
        starttime = time.time()
        wd = SPARQL(self.endpoint)
        results = wd.query(queryString)
        cityList = wd.asListOfDicts(results)
        if profile:
            print("Found %d cities  in %5.1f s" %
                  (len(cityList), time.time() - starttime))
        return cityList
 def testGetItemsByLabel(self):
     '''
     look up Wikidata items by their label on every configured endpoint
     and check that the first hit for each label is available
     '''
     debug = self.debug
     #debug=True
     qLabels = [
         "academic conference", "scientific conference series",
         "whisky distillery", "human"
     ]
     for endpointConf in self.endpointConfs:
         try:
             sparql = SPARQL(endpointConf.endpoint,
                             method=endpointConf.method)
             firstItemByLabel = {}
             for qLabel in qLabels:
                 candidates = WikidataItem.getItemsByLabel(sparql, qLabel)
                 if debug:
                     for i, item in enumerate(candidates):
                         print(f"{endpointConf.name} {i+1}:{item}")
                 # only the first candidate is kept for the check
                 firstItemByLabel[qLabel] = candidates[0]
             for qLabel in qLabels:
                 self.assertTrue(qLabel in firstItemByLabel)
         except (Exception, HTTPError) as ex:
             # any problem is delegated to the service-unavailable handler
             self.handleServiceUnavailable(ex, endpointConf)
Esempio n. 8
0
    def getLabelAndDescription(cls,
                               sparql: SPARQL,
                               itemId: str,
                               lang: str = "en"):
        '''
        get  the label for the given item and language
        
        Args:
            itemId(str): the wikidata Q/P id
            lang(str): the language of the label 
            
        Returns:
            (str,str): the label and description as a tuple
        '''
        query = f"""# get the label for the given item
{cls.getPrefixes(["rdfs","wd","schema"])}        
SELECT ?itemLabel ?itemDescription
WHERE
{{
  VALUES ?item {{
    wd:{itemId}
  }}
  ?item rdfs:label ?itemLabel.
  FILTER (LANG(?itemLabel) = "{lang}").
  ?item schema:description ?itemDescription.
  FILTER(LANG(?itemDescription) = "{lang}")
}}"""
        return sparql.getValues(query, ["itemLabel", "itemDescription"])
Esempio n. 9
0
 def testControlEscape(self):
     '''
     a UTF-8 string with tab, carriage return and newline must be
     delivered with these control characters escaped
     '''
     rawText = "Α\tΩ\r\n"
     escapedText = SPARQL.controlEscape(rawText)
     self.assertEqual("Α\\tΩ\\r\\n", escapedText)
Esempio n. 10
0
    def testStackoverflow55961615Query(self):
        '''
        see 
        https://stackoverflow.com/questions/55961615/how-to-integrate-wikidata-query-in-python
        https://stackoverflow.com/a/69771615/1497139
        '''
        qlod = None
        try:
            endpoint = "https://query.wikidata.org/sparql"
            wd = SPARQL(endpoint)
            queryString = """SELECT ?s ?sLabel ?item ?itemLabel ?sourceCode ?webSite ?stackexchangeTag  {
    SERVICE wikibase:mwapi {
        bd:serviceParam wikibase:api "EntitySearch".
        bd:serviceParam wikibase:endpoint "www.wikidata.org".
        bd:serviceParam mwapi:search "natural language processing".
        bd:serviceParam mwapi:language "en".
        ?item wikibase:apiOutputItem mwapi:item.
        ?num wikibase:apiOrdinal true.
    }
    ?s wdt:P279|wdt:P31 ?item .
    OPTIONAL { 
      ?s wdt:P1324 ?sourceCode.
    }
    OPTIONAL {    
      ?s wdt:P856 ?webSite.
    }
    OPTIONAL {    
      ?s wdt:P1482 ?stackexchangeTag.
    }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
}
ORDER BY ?itemLabel ?sLabel"""
            qlod = wd.queryAsListOfDicts(queryString, fixNone=True)
        except Exception as ex:
            print(f"{endpoint} access failed with {ex}- could not run test")

        if qlod is not None:
            query = Query(name="EntitySearch",
                          query=queryString,
                          lang='sparql')
            debug = self.debug
            for tablefmt in ["github", "mediawiki", "latex"]:
                qdoc = query.documentQueryResult(qlod, tablefmt=tablefmt)
                if debug:
                    print(qdoc)
Esempio n. 11
0
 def testIssue7(self):
     '''
     test conversion of dates with timezone info - the second
     case is expected to yield None
     '''
     cases = [
         ("2020-01-01T00:00:00Z", datetime.datetime(2020, 1, 1, 0, 0)),
         ("42000-01-01T00:00:00Z", None),
     ]
     for dateText, expectedDt in cases:
         actualDt = SPARQL.strToDatetime(dateText, debug=self.debug)
         self.assertEqual(expectedDt, actualDt)
Esempio n. 12
0
    def fromWikiData(self,endpoint):
        '''
        get the province List from WikiData

        Args:
            endpoint(string): the url of the endpoint to be used
        Returns:
            list: and sets it as self.provinceList as a side effect
        '''
        wd=SPARQL(endpoint)
        queryString="""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT ?region ?isocc ?isocode4 ?regionLabel ?population ?location
WHERE
{
  # administrative unit of first order
  ?region wdt:P31/wdt:P279* wd:Q10864048.
  OPTIONAL {
     ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
  }
  # filter historic regions
  # FILTER NOT EXISTS {?region wdt:P576 ?end}
  # get the population
  # https://www.wikidata.org/wiki/Property:P1082
  OPTIONAL { ?region wdt:P1082 ?population. }
  # # https://www.wikidata.org/wiki/Property:P297
  OPTIONAL { ?region wdt:P297 ?isocc. }
  # isocode state/province
  ?region wdt:P300 ?isocode4.
  # https://www.wikidata.org/wiki/Property:P625
  OPTIONAL { ?region wdt:P625 ?location. }
}
ORDER BY (?isocode4)
"""
        results=wd.query(queryString)
        self.provinceList=wd.asListOfDicts(results)
        for province in self.provinceList:
            province['wikidataurl']=province.pop('region')
            province['name']=province.pop('regionLabel')
            super().setNone(province,['population','location'])
        return self.provinceList
Esempio n. 13
0
    def testStats(self):
        if not self.available():
            return
        queries = [
            Query(
                'entities and usage frequency', '''
# get histogramm data of entities by
# usage frequency
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT ?c  (COUNT(?c) AS ?count)
WHERE {
  ?subject a  ?c
}
GROUP BY ?c
HAVING (?count >100)
ORDER BY DESC(?count)
        '''),
            Query(
                'relevance of fields',
                '''# get histogramm data of properties by
# usage frequency
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
HAVING (?propTotal >1000)
ORDER BY DESC(?propTotal)''')
        ]
        sparql = SPARQL(self.endpoint)
        for query in queries:
            listOfDicts = sparql.queryAsListOfDicts(query.query)
            markup = query.asWikiMarkup(listOfDicts)
            markup = markup.replace(
                "https://d-nb.info/standards/elementset/gnd", "gnd")
            print("=== %s ===" % query.name)
            print(markup)
Esempio n. 14
0
    def testStackoverflow71444069(self):
        '''
        https://stackoverflow.com/questions/71444069/create-csv-from-result-of-a-for-google-colab/71548650#71548650
        '''
        from lodstorage.sparql import SPARQL
        from lodstorage.csv import CSV
        sparqlQuery = """SELECT ?org ?orgLabel
WHERE
{
  ?org wdt:P31 wd:Q4830453. #instance of organizations
  ?org wdt:P17 wd:Q96. #Mexico country

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en"}
}"""
        sparql = SPARQL("https://query.wikidata.org/sparql")
        qlod = sparql.queryAsListOfDicts(sparqlQuery)
        csv = CSV.toCSV(qlod)
        if self.debug:
            print(csv)
Esempio n. 15
0
 def __init__(self,
              name,
              entityName,
              entityPluralName,
              config=None,
              debug=False):
     '''
     Constructor - prepares the storage access for the configured store mode

     Args:
         name(string): name of this eventManager
         entityName(string): entityType to be managed e.g. Country
         entityPluralName(string): plural of the entityType e.g. Countries
         config(StorageConfig): the configuration to be used if None a default configuration will be used
         debug(boolean): override debug setting when default of config is used via config=None
     '''
     if config is None:
         # the default configuration is adapted to this manager;
         # an explicitly given configuration is used as is
         config = StorageConfig.getDefault()
         if config.tableName is None:
             config.tableName = entityName
         if debug:
             config.debug = debug
     self.name = name
     self.entityName = entityName
     self.entityPluralName = entityPluralName
     self.config = config
     cacheFile = self.getCacheFile(config=config, mode=config.mode)
     progressMsg = "Creating %smanager(%s) for %s using cache %s" % (
         self.entityName, config.mode, self.name, cacheFile)
     self.showProgress(progressMsg)
     # set up the backend specific attributes
     if config.mode is StoreMode.DGRAPH:
         self.dgraph = Dgraph(debug=config.debug,
                              host=config.host,
                              profile=config.profile)
     elif config.mode is StoreMode.SPARQL:
         if config.endpoint is None:
             raise Exception("no endpoint set for mode sparql")
         self.endpoint = config.endpoint
         self.sparql = SPARQL(config.endpoint,
                              debug=config.debug,
                              profile=config.profile)
     elif config.mode is StoreMode.SQL:
         self.executeMany = False  # may be True when issues are fixed
Esempio n. 16
0
    def getRegions(self):
        '''
        get Regions from Wikidata
        
        `try query <https://query.wikidata.org/#%23%20get%20a%20list%20of%20regions%0A%23%20for%20geograpy3%20library%0A%23%20see%20https%3A%2F%2Fgithub.com%2Fsomnathrakshit%2Fgeograpy3%2Fissues%2F15%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0ASELECT%20%3Fcountry%20%3FcountryLabel%20%3FcountryIsoCode%20%3Fregion%20%3FregionIsoCode%20%3FregionLabel%20%3Fpopulation%20%3Flocation%0AWHERE%0A%7B%0A%20%20%23%20administrative%20unit%20of%20first%20order%0A%20%20%3Fregion%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ10864048.%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%3Fregion%20rdfs%3Alabel%20%3FregionLabel%20filter%20%28lang%28%3FregionLabel%29%20%3D%20%22en%22%29.%0A%20%20%7D%0A%20%20%23%20filter%20historic%20regions%0A%20%20%23%20FILTER%20NOT%20EXISTS%20%7B%3Fregion%20wdt%3AP576%20%3Fend%7D%0A%20%20%23%20get%20the%20population%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP1082%0A%20%20OPTIONAL%20%7B%20%3Fregion%20wdt%3AP1082%20%3Fpopulation.%20%7D%0A%20%20%23%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP297%0A%20%20OPTIONAL%20%7B%20%0A%20%20%20%20%3Fregion%20wdt%3AP17%20%3Fcountry.%0A%20%20%20%20%23%20label%20for%20the%20country%0A%20%20%20%20%3Fcountry%20rdfs%3Alabel%20%3FcountryLabel%20filter%20%28lang%28%3FcountryLabel%29%20%3D%20%22en%22%29.%0A%20%20%20%20%3Fcountry%20wdt%3AP297%20%3FcountryIsoCode.%20%0A%20%20%7D%0A%20%20%23%20isocode%20state%2Fprovince%0A%20%20%3Fregion%20wdt%3AP300%20%3FregionIsoCode.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP625%0A%20%20OPTIONAL%20%7B%20%3Fregion%20wdt%3AP625%20%3Flocation.%20%7D%0A%7D>`_
        '''
        queryString = """# get a list of regions
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT DISTINCT ?country ?countryLabel ?countryIsoCode ?region (max(?regionAlpha2) as ?regionIsoCode) ?regionLabel (max(?population) as ?regionPopulation) ?location
WHERE
{
  # administrative unit of first order
  ?region wdt:P31/wdt:P279* wd:Q10864048.
  OPTIONAL {
     ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
  }
  # filter historic regions
  # FILTER NOT EXISTS {?region wdt:P576 ?end}
  # get the population
  # https://www.wikidata.org/wiki/Property:P1082
  OPTIONAL { ?region wdt:P1082 ?population. }
  # # https://www.wikidata.org/wiki/Property:P297
  OPTIONAL { 
    ?region wdt:P17 ?country.
    # label for the country
    ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
    ?country wdt:P297 ?countryIsoCode. 
  }
  # isocode state/province
  ?region wdt:P300 ?regionAlpha2.
  # https://www.wikidata.org/wiki/Property:P625
  OPTIONAL { ?region wdt:P625 ?location. }
} GROUP BY ?country ?countryLabel ?countryIsoCode ?region ?regionIsoCode ?regionLabel ?location
ORDER BY ?regionIsoCode"""
        wd = SPARQL(self.endpoint)
        results = wd.query(queryString)
        self.regionList = wd.asListOfDicts(results)
Esempio n. 17
0
 def test_SPARQL(self):
     '''
     test SPARQL queries

     The test is currently disabled: it is reported as skipped instead of
     silently passing. The code below the skip is kept so that the test
     can be re-enabled by removing the skipTest call.
     '''
     # disable test for the time being - skipTest raises unittest.SkipTest
     # so nothing below this line is executed
     self.skipTest("disabled for the time being")
     qm=QueryManager(lang='sparql',debug=False)
     self.assertEqual(4,len(qm.queriesByName))
     endpoint="http://localhost:3030/cr"
     sparql=SPARQL(endpoint)
     for name,query in qm.queriesByName.items():
         listOfDicts=sparql.queryAsListOfDicts(query.query)
         markup=query.asWikiMarkup(listOfDicts)
         # turn the property uris into links to the wiki property pages
         markup=markup.replace("http://cr.bitplan.com/","https://cr.bitplan.com/index.php/Property:")
         print("== %s ==" % (name))
         print("=== query ===")
         print (query.asWikiSourceMarkup())
         print("=== result ===")
         print(markup)
Esempio n. 18
0
    def getCountries(self):
        '''
        get a list of countries
        
        `try query <https://query.wikidata.org/#%23%20get%20a%20list%20of%20countries%0A%23%20for%20geograpy3%20library%0A%23%20see%20https%3A%2F%2Fgithub.com%2Fsomnathrakshit%2Fgeograpy3%2Fissues%2F15%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20p%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2F%3E%0APREFIX%20ps%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fstatement%2F%3E%0APREFIX%20pq%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fqualifier%2F%3E%0A%23%20get%20City%20details%20with%20Country%0ASELECT%20DISTINCT%20%3Fcountry%20%3FcountryLabel%20%3FcountryIsoCode%20%3FcountryPopulation%20%3FcountryGDP_perCapita%20%3Fcoord%20%20WHERE%20%7B%0A%20%20%23%20instance%20of%20City%20Country%0A%20%20%3Fcountry%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ3624078%20.%0A%20%20%23%20label%20for%20the%20country%0A%20%20%3Fcountry%20rdfs%3Alabel%20%3FcountryLabel%20filter%20%28lang%28%3FcountryLabel%29%20%3D%20%22en%22%29.%0A%20%20%23%20get%20the%20coordinates%0A%20%20%3Fcountry%20wdt%3AP625%20%3Fcoord.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP297%20ISO%203166-1%20alpha-2%20code%0A%20%20%3Fcountry%20wdt%3AP297%20%3FcountryIsoCode.%0A%20%20%23%20population%20of%20country%0A%20%20%3Fcountry%20wdt%3AP1082%20%3FcountryPopulation.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP2132%0A%20%20%23%20nonminal%20GDP%20per%20capita%0A%20%20%3Fcountry%20wdt%3AP2132%20%3FcountryGDP_perCapita.%0A%7D>`_
      
        '''
        queryString = """# get a list of countries
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
# get City details with Country
SELECT DISTINCT ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGDP_perCapita ?countryCoord  WHERE {
  # instance of Country
  ?country wdt:P31/wdt:P279* wd:Q6256 .
  # VALUES ?country { wd:Q55}.
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # get the coordinates
  OPTIONAL { 
    select (max(?coord) as ?countryCoord) where {
      ?country wdt:P625 ?coord.
    }
  } 
  # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country   
  ?country wdt:P1082 ?countryPopulation.
  # https://www.wikidata.org/wiki/Property:P2132
  # nominal GDP per capita
  OPTIONAL { ?country wdt:P2132 ?countryGDP_perCapita. }
}
ORDER BY ?countryIsoCode"""
        wd = SPARQL(self.endpoint)
        results = wd.query(queryString)
        self.countryList = wd.asListOfDicts(results)
Esempio n. 19
0
 def __init__(self,
              itemQid,
              propertyLabels: list = None,
              propertyIds: list = None,
              subclassPredicate="wdt:P31",
              where: str = None,
              endpointConf=None,
              lang="en",
              debug=False):
     '''
     Constructor

     Args:
         itemQid(str): wikidata id of the type to analyze
         propertyLabels(list): a list of labels of properties to be considered - None is handled as an empty list
         propertyIds(list): a list of ids of properties to be considered - None is handled as an empty list
         subclassPredicate(str): the subclass Predicate to use
         where(str): extra where clause for instance selection (if any)
         endpointConf(Endpoint): the endpoint configuration to use - if None Endpoint.getDefault() is used
         lang(str): the language handed to the item and property lookups
         debug(bool): if True switch on debugging for me and my SPARQL access
     '''
     # None replaces the former mutable [] defaults which would have been
     # shared between all calls of this constructor
     if propertyLabels is None:
         propertyLabels = []
     if propertyIds is None:
         propertyIds = []
     self.itemQid = itemQid
     self.debug = debug
     if endpointConf is None:
         endpointConf = Endpoint.getDefault()
     self.endpointConf = endpointConf
     self.sparql = SPARQL(endpointConf.endpoint,
                          method=self.endpointConf.method)
     self.sparql.debug = self.debug
     self.subclassPredicate = subclassPredicate
     # the extra where clause is kept on a line of its own (or empty)
     self.where = f"\n  {where}" if where is not None else ""
     self.lang = lang
     self.item = WikidataItem(itemQid, sparql=self.sparql, lang=lang)
     self.queryManager = TrulyTabular.getQueryManager(debug=self.debug)
     # properties found by id are complemented by the ones found by label
     self.properties = WikidataProperty.getPropertiesByIds(
         self.sparql, propertyIds, lang)
     self.properties.update(
         WikidataProperty.getPropertiesByLabels(self.sparql, propertyLabels,
                                                lang))
     self.isodate = datetime.datetime.now().isoformat()
     self.error = None
Esempio n. 20
0
    def testIssue20And76(self):
        '''
        see https://github.com/WolfgangFahl/pyLoDStorage/issues/20
        add fixNone option to SPARQL results (same functionality as in SQL)
        
         https://github.com/WolfgangFahl/pyLoDStorage/issues/76
        SPARQL GET method support
        '''
        endpoint = "https://query.wikidata.org/sparql"
        for method in ["POST", "GET"]:
            wd = SPARQL(endpoint, method=method)
            queryString = """
        # Conference Series wikidata query
# see https://confident.dbis.rwth-aachen.de/dblpconf/wikidata
# WF 2021-01-30
SELECT ?confSeries ?short_name ?official_website
WHERE 
{
  #  scientific conference series (Q47258130) 
  ?confSeries wdt:P31 wd:Q47258130.
  OPTIONAL { 
    ?confSeries wdt:P1813 ?short_name . 
  }
  #  official website (P856) 
  OPTIONAL {
    ?confSeries wdt:P856 ?official_website
  } 
}
LIMIT 200
"""
            lod = wd.queryAsListOfDicts(queryString, fixNone=True)
            fields = LOD.getFields(lod)
            if self.debug:
                print(fields)
            for row in lod:
                for field in fields:
                    self.assertTrue(field in row)
Esempio n. 21
0
    def testWikdata(self):
        '''
        check wikidata
        '''
        # check we have local wikidata copy:
        #if getpass.getuser()=="wf":
        #    # use 2018 wikidata copy
        #    endpoint="http://jena.zeus.bitplan.com/wikidata/"
        endpoint = "https://query.wikidata.org/sparql"
        wd = SPARQL(endpoint)
        queryString = """# get a list of whisky distilleries
PREFIX wd: <http://www.wikidata.org/entity/>            
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?item ?coord 
WHERE 
{
  # instance of whisky distillery
  ?item wdt:P31 wd:Q10373548.
  # get the coordinate
  ?item wdt:P625 ?coord.
}
"""
        results = wd.query(queryString)
        self.assertTrue(238 <= len(results))
Esempio n. 22
0
    def testSparqlQueries(self):
        '''
        run the selected named SPARQL queries and document
        their results in several table formats
        '''
        show = self.debug
        #show=True
        selectedNames = ["US President Nicknames"]
        qm = QueryManager(lang='sparql', debug=False)
        for name, query in qm.queriesByName.items():
            if name not in selectedNames:
                continue
            if show:
                print(f"{name}:{query}")
            endpoint = SPARQL(query.endpoint)
            try:
                qlod = endpoint.queryAsListOfDicts(query.query)
                for tablefmt in ["mediawiki", "github", "latex"]:
                    doc = query.documentQueryResult(qlod,
                                                    tablefmt=tablefmt,
                                                    floatfmt=".0f")
                    docstr = doc.asText()
                    if show:
                        print(docstr)

            except Exception as ex:
                # a failing query is only reported - it does not fail the test
                print(f"{query.title} at {query.endpoint} failed: {ex}")
Esempio n. 23
0
    def getItemsByLabel(cls,
                        sparql: SPARQL,
                        itemLabel: str,
                        lang: str = "en") -> list:
        '''
        get a Wikidata items by the given label
        
        Args:
            sparql(SPARQL): the SPARQL endpoint to use
            itemLabel(str): the label of the items
            lang(str): the language of the label
            
        Returns:
            a list of potential items
        '''
        valuesClause = f'   "{itemLabel}"@{lang}\n'
        query = f"""# get the items that have the given label in the given language
# e.g. we'll find human=Q5 as the oldest type for the label "human" first
# and then the newer ones such as "race in Warcraft"
{cls.getPrefixes(["rdfs","schema","xsd"])}
SELECT 
  #?itemId 
  ?item 
  ?itemLabel 
  ?itemDescription
WHERE {{ 
  VALUES ?itemLabel {{
    {valuesClause}
  }}
  #BIND (xsd:integer(SUBSTR(STR(?item),33)) AS ?itemId)
  ?item rdfs:label ?itemLabel. 
  ?item schema:description ?itemDescription.
  FILTER(LANG(?itemDescription)="{lang}")
}} 
#ORDER BY ?itemId"""
        qLod = sparql.queryAsListOfDicts(query)
        items = []
        for record in qLod:
            url = record["item"]
            qid = re.sub(r"http://www.wikidata.org/entity/(.*)", r"\1", url)
            item = WikidataItem(qid)
            item.url = url
            item.qlabel = record["itemLabel"]
            item.varname = Variable.validVarName(item.qlabel)
            item.description = record["itemDescription"]
            items.append(item)
        sortedItems = sorted(items, key=lambda item: item.qnumber)
        return sortedItems
Esempio n. 24
0
 def getJena(self,
             mode='query',
             debug=False,
             typedLiterals=False,
             profile=False):
     '''
     create the SPARQL access for the jena endpoint in the given mode
     
     Args:
        mode(string): query or update
        debug(boolean): True if debug information should be output
        typedLiterals(boolean): True if INSERT DATA SPARQL commands should use typed literals
        profile(boolean): True if profile/timing information should be shown
        
     Returns:
        SPARQL: the access object for http://localhost:3030/example
     '''
     jenaUrl = "http://localhost:3030/example"
     options = {
         "mode": mode,
         "debug": debug,
         "typedLiterals": typedLiterals,
         "profile": profile,
     }
     return SPARQL(jenaUrl, **options)
 def testGetPropertiesById(self):
     '''
     try getting properties by their property ids
     
     every configured endpoint is queried for the wanted ids; any exception
     is handed over to handleServiceUnavailable together with the endpoint
     configuration
     '''
     showDebug = self.debug
     wantedIds = ["P1800"]
     wantedResults = ["Wikimedia database name"]
     for conf in self.endpointConfs:
         try:
             endpoint = SPARQL(conf.endpoint, method=conf.method)
             found = WikidataProperty.getPropertiesByIds(endpoint,
                                                         wantedIds,
                                                         lang="en")
             for index, prop in enumerate(found):
                 if showDebug:
                     print(f"{conf.name} {index}:{prop}")
                 self.assertEqual(prop, wantedResults[index])
         except (Exception, HTTPError) as ex:
             self.handleServiceUnavailable(ex, conf)
Esempio n. 26
0
class TrulyTabular(object):
    '''
    truly tabular SPARQL/RDF analysis
    
    checks "how tabular" a query based on a list of properties of an itemclass is
    '''
    def __init__(self,
                 itemQid,
                 propertyLabels: list = None,
                 propertyIds: list = None,
                 subclassPredicate="wdt:P31",
                 where: str = None,
                 endpointConf=None,
                 lang="en",
                 debug=False):
        '''
        Constructor
        
        Args:
            itemQid(str): wikidata id of the type to analyze 
            propertyLabels(list): a list of labels of properties to be considered
                (None is handled like an empty list)
            propertyIds(list): a list of ids of properties to be considered
                (None is handled like an empty list)
            subclassPredicate(str): the subclass Predicate to use
            where(str): extra where clause for instance selection (if any)
            endpointConf: the endpoint configuration to use - its endpoint and
                method attributes are read; if None Endpoint.getDefault() is used
            lang(str): the language to use for labels
            debug(bool): if True switch on debugging for me and my SPARQL access
        '''
        # never share one mutable default list between calls
        if propertyLabels is None:
            propertyLabels = []
        if propertyIds is None:
            propertyIds = []
        self.itemQid = itemQid
        self.debug = debug
        if endpointConf is None:
            endpointConf = Endpoint.getDefault()
        self.endpointConf = endpointConf
        self.sparql = SPARQL(endpointConf.endpoint,
                             method=endpointConf.method)
        self.sparql.debug = self.debug
        self.subclassPredicate = subclassPredicate
        # the extra where clause is appended to generated queries on a new line
        self.where = f"\n  {where}" if where is not None else ""
        self.lang = lang
        self.item = WikidataItem(itemQid, sparql=self.sparql, lang=lang)
        self.queryManager = TrulyTabular.getQueryManager(debug=self.debug)
        # properties found by id first, then completed by those found by label
        self.properties = WikidataProperty.getPropertiesByIds(
            self.sparql, propertyIds, lang)
        self.properties.update(
            WikidataProperty.getPropertiesByLabels(self.sparql, propertyLabels,
                                                   lang))
        # timestamp used in the header of generated queries
        self.isodate = datetime.datetime.now().isoformat()
        # last exception caught by count() - None if there was none
        self.error = None

    def __str__(self):
        '''
        Returns:
            str: my text representation
        '''
        return self.asText(long=False)

    def count(self):
        '''
        get my count
        '''
        itemText = self.getItemText()
        query = f"""# Count all items with the given type
# {itemText}
{WikidataItem.getPrefixes()}
SELECT (COUNT (DISTINCT ?item) AS ?count)
WHERE
{{
  # instance of {self.item.qlabel}
  ?item {self.subclassPredicate} wd:{self.item.qid}.{self.where}
}}"""
        try:
            count = self.sparql.getValue(query, "count")
            # workaround https://github.com/ad-freiburg/qlever/issues/717
            count = int(count)
        except Exception as ex:
            self.error = ex
            count = None

        return count, query

    def asText(self, long: bool = True):
        '''
        returns my content as a text representation
        
        Args:
            long(bool): True if a long format including url is wished
            
        Returns:
            str: a text representation of my content
        '''
        text = self.item.asText(long)
        return text

    def getItemText(self):
        # leads to 405 Method not allowed in SPARQLWrapper under certain circumstances
        # itemText=self.asText(long=True)
        itemText = f"{self.itemQid}:{self.item.qlabel}"
        return itemText

    @classmethod
    def getQueryManager(cls, lang='sparql', name="trulytabular", debug=False):
        '''
        get the query manager for the given language and name
        
        Args:
            lang(str): the language of the queries to extract
            name(str): the name of the manager containing the query specifications
            debug(bool): if True set debugging on
            
        Returns:
            QueryManager: the manager for the first existing "<name>.yaml" file
            among the paths supplied by YamlPath.getPaths - None if there is no such file
        '''
        yamlName = f"{name}.yaml"
        for candidate in YamlPath.getPaths(yamlName):
            if not os.path.isfile(candidate):
                continue
            return QueryManager(lang=lang, debug=debug, queriesPath=candidate)
        return None

    def generateSparqlQuery(self,
                            genMap: dict,
                            listSeparator: str = "⇹",
                            naive: bool = True,
                            lang: str = 'en') -> str:
        '''
        generate a SPARQL Query for my item and my properties
        
        Args:
            genMap(dict): the generation items per property id; each value is
                a list that may contain "ignore", "label", "list", "count" 
                or another aggregate name which is used in upper case 
                as the SPARQL aggregate function
            listSeparator(str): the symbol to use as a list separator for GROUP_CONCAT
            naive(bool): if True - generate a naive straight forward SPARQL query
                if False generate a proper truly tabular aggregate query
            lang(str): the language to generate for
            
        Returns:
            str: the generated SPARQL Query
        '''
        # The Wikidata item to generate the query for
        item = self.item
        # the name of this script
        script = Path(__file__).name
        # the mode of generation
        naiveText = "naive" if naive else "aggregate"
        # start with the preamble and PREFIX section
        # select the item and its label
        sparqlQuery = f"""# truly tabular {naiveText} query for 
# {item.qid}:{item.qlabel}
# generated by {script} version {Version.version} on {self.isodate}
{WikidataItem.getPrefixes()}
SELECT ?{item.itemVarname} ?{item.labelVarname}"""
        # loop over all properties to add the selected columns
        for wdProp in self.properties.values():
            if naive:
                # naive mode: simply select the value of every property
                sparqlQuery += f"\n  ?{wdProp.valueVarname}"
            else:
                # aggregate mode: only properties with a genMap entry are selected
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    for aggregate in genList:
                        if not aggregate in ["ignore", "label"]:
                            distinct = ""
                            if aggregate == "list":
                                # "list" concatenates the distinct values using the separator
                                aggregateFunc = "GROUP_CONCAT"
                                aggregateParam = f';SEPARATOR="{listSeparator}"'
                                distinct = "DISTINCT "
                            else:
                                # any other aggregate name is used as the function name
                                # only "count" is applied to distinct values
                                if aggregate == "count":
                                    distinct = "DISTINCT "
                                aggregateFunc = aggregate.upper()
                                aggregateParam = ""
                            sparqlQuery += f"\n  ({aggregateFunc} ({distinct}?{wdProp.valueVarname}{aggregateParam}) AS ?{wdProp.valueVarname}_{aggregate})"
                        elif aggregate == "label":
                            sparqlQuery += f"\n  ?{wdProp.labelVarname}"
                        elif aggregate == "ignore" and not "label" in genList:
                            # plain value column - only if the label is not selected instead
                            sparqlQuery += f"\n  ?{wdProp.valueVarname}"
        # open the where clause with the instance selection and the item label
        sparqlQuery += f"""
WHERE {{
  # instanceof {item.qid}:{item.qlabel}
  ?{item.itemVarname} {self.subclassPredicate} wd:{item.qid}.
  # label
  ?{item.itemVarname} rdfs:label ?{item.labelVarname}.  
  FILTER (LANG(?{item.labelVarname}) = "{lang}").
"""
        # every property is fetched in an OPTIONAL block of its own
        for wdProp in self.properties.values():
            sparqlQuery += f"""  # {wdProp}
  OPTIONAL {{ 
    ?{item.itemVarname} wdt:{wdProp.pid} ?{wdProp.valueVarname}. """
            if wdProp.pid in genMap:
                genList = genMap[wdProp.pid]
                if "label" in genList:
                    # also fetch the label of the value in the requested language
                    sparqlQuery += f"""\n    ?{wdProp.valueVarname} rdfs:label ?{wdProp.labelVarname}."""
                    sparqlQuery += f"""\n    FILTER (LANG(?{wdProp.labelVarname}) = "{lang}")."""
            sparqlQuery += "\n  }\n"
        # close where Clause
        sparqlQuery += """}\n"""
        # optionally add Aggregate
        if not naive:
            # group by the item, its label and every column selected without aggregate
            sparqlQuery += f"""GROUP BY
  ?{item.itemVarname} 
  ?{item.labelVarname}
"""
            for wdProp in self.properties.values():
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    if "label" in genList:
                        sparqlQuery += f"\n  ?{wdProp.labelVarname}"
                    if "ignore" in genList and not "label" in genList:
                        sparqlQuery += f"\n  ?{wdProp.valueVarname}"
            # properties marked "ignore" get a HAVING condition 
            # that allows at most one value per group
            havingCount = 0
            # the first condition is indented, the following ones are joined with &&
            havingDelim = "   "
            for wdProp in self.properties.values():
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    if "ignore" in genList:
                        havingCount += 1
                        if havingCount == 1:
                            # open the HAVING clause with the first condition
                            sparqlQuery += f"\nHAVING ("

                        sparqlQuery += f"\n  {havingDelim}COUNT(?{wdProp.valueVarname})<=1"
                        havingDelim = "&& "
            if havingCount > 0:
                # close the HAVING clause if one was opened
                sparqlQuery += f"\n)"
        return sparqlQuery

    def mostFrequentPropertiesQuery(self,
                                    whereClause: str = None,
                                    minCount: int = 0):
        '''
        get the query for the most frequently used properties of my item type
        
        Args:
            whereClause(str): an extra WhereClause to use - if None 
                the instance selection for my item type is used
            minCount(int): if greater than zero only properties 
                with a count above this value are selected
                
        Returns:
            Query: the query with name, SPARQL text and title
        '''
        database = self.endpointConf.database
        isQlever = database == "qlever"
        if whereClause is None:
            whereClause = f"?item {self.subclassPredicate} wd:{self.itemQid}"
            if not isQlever:
                whereClause += ";?p ?id"
        whereClause += "."
        minCountFilter = (f"\n  FILTER(?count >{minCount})."
                          if minCount > 0 else "")
        itemText = self.getItemText()
        # the query is assembled from its parts in order of appearance
        parts = [
            "# get the most frequently used properties for\n",
            f"# {itemText}\n",
            f"{WikidataItem.getPrefixes()}\n",
            "SELECT ?prop ?propLabel ?wbType ?count WHERE {\n",
            "  {",
        ]
        # qlever counts per predicate, the other databases per property
        countedVar = "?p" if isQlever else "?prop"
        parts.append(
            f"\n    SELECT {countedVar} (COUNT(DISTINCT ?item) AS ?count) WHERE {{"
        )
        if database == "blazegraph":
            parts.append('\n      hint:Query hint:optimizer "None".')
        parts.append(f"\n      {whereClause}")
        if isQlever:
            parts.append("  \n"
                         "      ?item ql:has-predicate ?p \n"
                         "    } GROUP BY ?p\n"
                         "  }\n"
                         "  ?prop wikibase:directClaim ?p.")
        else:
            parts.append("\n"
                         "      ?prop wikibase:directClaim ?p.\n"
                         "    }\n"
                         "    GROUP BY ?prop ?propLabel\n"
                         "  }")
        parts.append("\n"
                     "  ?prop rdfs:label ?propLabel.\n"
                     "  ?prop wikibase:propertyType ?wbType.\n")
        parts.append(
            f'  FILTER(LANG(?propLabel) = "{self.lang}").{minCountFilter}  \n')
        parts.append("}\n"
                     "ORDER BY DESC (?count)\n")
        sparqlQuery = "".join(parts)
        title = f"most frequently used properties for {self.item.asText(long=True)}"
        return Query(name=f"mostFrequentProperties for {itemText}",
                     query=sparqlQuery,
                     title=title)

    def noneTabularQuery(self,
                         wdProperty: WikidataProperty,
                         asFrequency: bool = True):
        '''
        get the query for the none tabular entries of the given property
        
        Args:
            wdProperty(WikidataProperty): the property to analyze
            asFrequency(bool): if True count how often each number of values 
                per item occurs, if False select the items 
                having more than one value
                
        Returns:
            Query: the query with SPARQL text, name and title
        '''
        label = wdProperty.plabel
        pid = wdProperty.pid
        # work around https://github.com/RDFLib/sparqlwrapper/issues/211
        if "described at" in label:
            label = label.replace("described at", "describ'd at")
        # number of values per item for the given property
        perItemQuery = (
            "SELECT ?item ?itemLabel (COUNT (?value) AS ?count)\n"
            "WHERE\n"
            "{\n"
            f"  # instance of {self.item.qlabel}\n"
            f"  ?item {self.subclassPredicate} wd:{self.itemQid}.{self.where}\n"
            "  ?item rdfs:label ?itemLabel.\n"
            f'  FILTER (LANG(?itemLabel) = "{self.lang}").\n'
            f"  # {label}\n"
            f"  ?item {wdProperty.getPredicate()} ?value.\n"
            "} GROUP BY ?item ?itemLabel\n")
        if asFrequency:
            freqDesc = "frequencies"
            # wrap the per item query as a subquery and count the counts
            body = ("SELECT ?count (COUNT(?count) AS ?frequency) WHERE {{\n" +
                    perItemQuery + "\n}}\n"
                    "GROUP BY ?count\n"
                    "ORDER BY DESC (?frequency)")
        else:
            freqDesc = "records"
            body = (perItemQuery + "\nHAVING (COUNT (?value) > 1)\n"
                    "ORDER BY DESC(?count)")
        itemText = self.getItemText()
        header = (
            f"# Count all {itemText} items\n"
            f"# with the given {label}({pid}) https://www.wikidata.org/wiki/Property:{pid} \n"
            f"{WikidataItem.getPrefixes()}\n")
        title = f"non tabular entries for {self.item.qlabel}/{label}:{freqDesc}"
        name = f"NonTabular {self.item.qlabel}/{label}:{freqDesc}"
        return Query(query=header + body, name=name, title=title)

    def noneTabular(self, wdProperty: WikidataProperty):
        '''
        get the none tabular result for the given Wikidata property
        
        Args:
            wdProperty(WikidataProperty): the Wikidata property
        '''
        query = self.noneTabularQuery(wdProperty)
        if self.debug:
            logging.info(query.query)
        qlod = self.sparql.queryAsListOfDicts(query.query)
        return qlod

    def addStatsColWithPercent(self, m: dict, col: str,
                               value: Union[int, float], total: Union[int,
                                                                      float]):
        '''
        add a statistics Column
        Args:
            m(dict):
            col(str): name of the column
            value: value
            total: total value
        '''
        m[col] = value
        if total is not None and total > 0:
            m[f"{col}%"] = float(f"{value/total*100:.1f}")
        else:
            m[f"{col}%"] = None

    def genWdPropertyStatistic(self,
                               wdProperty: WikidataProperty,
                               itemCount: int,
                               withQuery=True) -> dict:
        '''
        generate a property Statistics Row for the given wikidata Property
        
        Args:
            wdProperty(WikidataProperty): the property to get the statistics for
            itemCount(int): the total number of items to check
            withQuery(bool): if true include the sparql query
            
        Returns:
            dict: a statistics row
        '''
        ntlod = self.noneTabular(wdProperty)
        statsRow = {"property": wdProperty.plabel}
        total = 0
        nttotal = 0
        maxCount = 0
        for record in ntlod:
            f = int(record["frequency"])
            count = int(record["count"])
            #statsRow[f"f{count}"]=f
            if count > 1:
                nttotal += f
            else:
                statsRow["1"] = f
            if count > maxCount:
                maxCount = count
            total += f
        statsRow["maxf"] = maxCount
        if withQuery:
            statsRow["queryf"] = self.noneTabularQuery(wdProperty).query
            statsRow["queryex"] = self.noneTabularQuery(
                wdProperty, asFrequency=False).query
        self.addStatsColWithPercent(statsRow, "total", total, itemCount)
        self.addStatsColWithPercent(statsRow, "non tabular", nttotal, total)
        return statsRow

    def genPropertyStatistics(self):
        '''
        generate the property Statistics
        
        Returns:
            generator: a generator of statistic dict rows
        '''
        itemCount, _itemCountQuery = self.count()
        for wdProperty in self.properties.values():
            statsRow = self.genWdPropertyStatistic(wdProperty, itemCount)
            yield statsRow

    def getPropertyStatistics(self):
        '''
        get the property Statistics
        '''
        itemCount, _itemCountQuery = self.count()
        lod = [{"property": "∑", "total": itemCount, "total%": 100.0}]
        for wdProperty in self.properties.values():
            statsRow = self.genWdPropertyStatistic(wdProperty, itemCount)
            lod.append(statsRow)
        return lod
Esempio n. 27
0
class EntityManager(YamlAbleMixin, JsonAbleMixin):
    '''
    generic entity manager
    '''
    def __init__(self,
                 name,
                 entityName,
                 entityPluralName,
                 config=None,
                 debug=False):
        '''
        Constructor
        
        Args:
            name(string): name of this eventManager
            entityName(string): entityType to be managed e.g. Country
            entityPluralName(string): plural of the the entityType e.g. Countries
            config(StorageConfig): the configuration to be used if None a default configuration will be used
            debug(boolean): override debug setting when default of config is used via config=None
            
        Raises:
            Exception: if the mode is SPARQL and no endpoint is configured
        '''
        self.name = name
        self.entityName = entityName
        self.entityPluralName = entityPluralName
        if config is None:
            # default configuration: table named after the entity
            config = StorageConfig.getDefault()
            if config.tableName is None:
                config.tableName = entityName
            if debug:
                config.debug = debug
        self.config = config
        storeMode = config.mode
        cacheFile = self.getCacheFile(config=config, mode=storeMode)
        progressArgs = (self.entityName, storeMode, self.name, cacheFile)
        self.showProgress("Creating %smanager(%s) for %s using cache %s" %
                          progressArgs)
        # prepare the storage access that fits the mode
        if storeMode is StoreMode.DGRAPH:
            self.dgraph = Dgraph(debug=config.debug,
                                 host=config.host,
                                 profile=config.profile)
        elif storeMode is StoreMode.SPARQL:
            endpoint = config.endpoint
            if endpoint is None:
                raise Exception("no endpoint set for mode sparql")
            self.endpoint = endpoint
            self.sparql = SPARQL(endpoint,
                                 debug=config.debug,
                                 profile=config.profile)
        elif storeMode is StoreMode.SQL:
            self.executeMany = False  # may be True when issues are fixed

    def storeMode(self):
        '''
        get my store mode
        
        Returns:
            the mode of my storage configuration
        '''
        config = self.config
        return config.mode

    def showProgress(self, msg):
        ''' 
        print the given progress message 
        if my configuration has withShowProgress set
            
        Args:
            msg(string): the message to display
        '''
        if not self.config.withShowProgress:
            return
        print(msg, flush=True)

    @staticmethod
    def getCachePath():
        '''
        get the path of the cache directory
        
        Returns:
            str: the "cache" directory next to the directory of this module
        '''
        moduleDir = os.path.dirname(__file__)
        return f"{moduleDir}/../cache"

    def getCacheFile(self, config=None, mode=StoreMode.SQL):
        '''
        get the cache file for this event manager
        
        Args:
            config(StorageConfig): the configuration to use; a cacheFile set 
                in it is returned as is. If None my own configuration is 
                used to look up the endpoint (SPARQL) or table name (SQL)
            mode(StoreMode): the storeMode to use
            
        Returns:
            str: the path to the file for my cached data - for SPARQL a text 
            naming the endpoint and for unsupported modes a text stating 
            that the cache path is undefined
        '''
        if config is not None and config.cacheFile is not None:
            return config.cacheFile
        # the SPARQL and SQL branches read attributes of the configuration
        # so fall back to my own one instead of failing on None
        if config is None and (mode is StoreMode.SPARQL
                               or mode is StoreMode.SQL):
            config = self.config
        cachedir = EntityManager.getCachePath()
        if mode is StoreMode.JSON:
            cachepath = "%s/%s-%s.%s" % (cachedir, self.name, "events", 'json')
        elif mode is StoreMode.SPARQL:
            cachepath = "%s %s" % ('SPARQL', config.endpoint)
        elif mode is StoreMode.SQL:
            cachepath = "%s/%s.db" % (cachedir, config.tableName)
        else:
            cachepath = "undefined cachepath for %s" % (mode)
        return cachepath

    def getSQLDB(self, cacheFile):
        '''
        get the SQL database for the given cacheFile 
        and remember it as self.sqldb
        
        Args:
            cacheFile(string): the file to get the SQL db from
            
        Returns:
            SQLDB: the database using the debug settings of my configuration
        '''
        self.sqldb = SQLDB(cacheFile,
                           debug=self.config.debug,
                           errorDebug=self.config.errorDebug)
        return self.sqldb

    def isCached(self):
        ''' check whether there is a file containing cached 
        data for me '''
        result = False
        config = self.config
        mode = self.config.mode
        if mode is StoreMode.JSON:
            result = os.path.isfile(
                self.getCacheFile(config=self.config, mode=StoreMode.JSON))
        elif mode is StoreMode.SPARQL:
            # @FIXME - make abstract
            query = config.prefix + """
SELECT  ?source (COUNT(?source) AS ?sourcecount)
WHERE { 
   ?event cr:Event_source ?source.
}
GROUP by ?source
"""
            sourceCountList = self.sparql.queryAsListOfDicts(query)
            for sourceCount in sourceCountList:
                source = sourceCount['source']
                recordCount = sourceCount['sourcecount']
                if source == self.name and recordCount > 100:
                    result = True
        elif mode is StoreMode.SQL:
            cacheFile = self.getCacheFile(config=self.config,
                                          mode=StoreMode.SQL)
            if os.path.isfile(cacheFile):
                sqlQuery = "SELECT COUNT(*) AS count FROM %s" % config.tableName
                try:
                    sqlDB = self.getSQLDB(cacheFile)
                    countResult = sqlDB.query(sqlQuery)
                    count = countResult[0]['count']
                    result = count > 100
                except Exception as ex:
                    # e.g. sqlite3.OperationalError: no such table: Event_crossref
                    pass
        else:
            raise Exception("unsupported mode %s" % self.mode)
        return result

    def fromCache(self):
        '''
        get my entries from the cache - if there is no cached data yet 
        the entries are fetched via getListOfDicts and stored first
        
        Returns:
            the list of Dicts and as a side effect setting self.cacheFile
        '''
        if self.isCached():
            # fromStore also sets self.cacheFile
            return self.fromStore()
        freshRecords = self.getListOfDicts()
        self.cacheFile = self.store(freshRecords)
        return freshRecords

    def fromStore(self, cacheFile=None):
        '''
        restore me from the store
        Args:
            cacheFile(String): the cacheFile to use if None use the preconfigured Cachefile
        Returns:
            list: list of dicts or JSON entitymanager
        '''
        startTime = time.time()
        if cacheFile is None:
            cacheFile = self.getCacheFile(config=self.config,
                                          mode=self.config.mode)
        self.cacheFile = cacheFile
        self.showProgress("reading %s for %s from cache %s" %
                          (self.entityPluralName, self.name, cacheFile))
        JSONem = None
        mode = self.config.mode
        if mode is StoreMode.JSON:
            JSONem = JsonAbleMixin.readJson(cacheFile)
        elif mode is StoreMode.SPARQL:
            # @FIXME make abstract
            eventQuery = """
PREFIX cr: <http://cr.bitplan.com/>
SELECT ?eventId ?acronym ?series ?title ?year ?country ?city ?startDate ?endDate ?url ?source WHERE { 
   OPTIONAL { ?event cr:Event_eventId ?eventId. }
   OPTIONAL { ?event cr:Event_acronym ?acronym. }
   OPTIONAL { ?event cr:Event_series ?series. }
   OPTIONAL { ?event cr:Event_title ?title. }
   OPTIONAL { ?event cr:Event_year ?year.  }
   OPTIONAL { ?event cr:Event_country ?country. }
   OPTIONAL { ?event cr:Event_city ?city. }
   OPTIONAL { ?event cr:Event_startDate ?startDate. }
   OPTIONAL { ?event cr:Event_endDate ?endDate. }
   OPTIONAL { ?event cr:Event_url ?url. }
   ?event cr:Event_source ?source FILTER(?source='%s').
}
""" % self.name
            listOfDicts = self.sparql.queryAsListOfDicts(eventQuery)
        elif mode is StoreMode.SQL:
            sqlQuery = "SELECT * FROM %s" % self.config.tableName
            sqlDB = self.getSQLDB(cacheFile)
            listOfDicts = sqlDB.query(sqlQuery)
            sqlDB.close()
            pass
        else:
            raise Exception("unsupported store mode %s" % self.mode)

        if JSONem is not None:
            return JSONem
        else:
            self.showProgress("read %d %s from %s in %5.1f s" %
                              (len(listOfDicts), self.entityPluralName,
                               self.name, time.time() - startTime))
            return listOfDicts

    def store(self,
              listOfDicts,
              limit=10000000,
              batchSize=250,
              cacheFile=None,
              sampleRecordCount=1):
        ''' 
        store my entities 
        
        Args:
            listOfDicts(list): the list of dicts to store
            limit(int): maximum number of records to store
            batchSize(int): size of batch for storing
            cacheFile(string): the name of the storage e.g path to JSON or sqlite3 file
            sampleRecordCount(int): the number of records to analyze for type information
            
        Returns:
            the cacheFile that was used - for the DGRAPH and SPARQL mode 
            this is the cacheFile parameter as given
            
        Raises:
            Exception: if my store mode is none of JSON, DGRAPH, SPARQL or SQL
        '''
        config = self.config
        mode = config.mode
        if mode is StoreMode.JSON:
            if cacheFile is None:
                cacheFile = self.getCacheFile(config=config,
                                              mode=StoreMode.JSON)
            # writeJson serializes this manager itself, so the count is taken 
            # from self.events
            # NOTE(review): events is not set in this class - presumably 
            # supplied by a subclass; confirm
            self.showProgress("storing %d events for %s to cache %s" %
                              (len(self.events), self.name, cacheFile))
            self.writeJson(cacheFile)
        elif mode is StoreMode.DGRAPH:
            startTime = time.time()
            # report what is actually stored using the local mode
            # (there is no self.mode attribute)
            self.showProgress(
                "storing %d %s for %s to %s" %
                (len(listOfDicts), self.entityPluralName, self.name, mode))
            self.dgraph.addData(listOfDicts, limit=limit, batchSize=batchSize)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        elif mode is StoreMode.SPARQL:
            startTime = time.time()
            # @ FIXME make abstract
            self.showProgress("storing %d events for %s to %s" %
                              (len(listOfDicts), self.name, mode))
            entityType = "cr:Event"
            prefixes = "PREFIX cr: <http://cr.bitplan.com/>"
            primaryKey = "eventId"
            self.sparql.insertListOfDicts(listOfDicts,
                                          entityType,
                                          primaryKey,
                                          prefixes,
                                          limit=limit,
                                          batchSize=batchSize)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        elif mode is StoreMode.SQL:
            startTime = time.time()
            if cacheFile is None:
                cacheFile = self.getCacheFile(config=config, mode=mode)
            sqldb = self.getSQLDB(cacheFile)
            self.showProgress("storing %d %s for %s to %s:%s" %
                              (len(listOfDicts), self.entityPluralName,
                               self.name, mode, cacheFile))
            # the table is dropped and recreated from the sample records
            entityInfo = sqldb.createTable(listOfDicts,
                                           config.tableName,
                                           "eventId",
                                           withDrop=True,
                                           sampleRecordCount=sampleRecordCount)
            sqldb.store(listOfDicts,
                        entityInfo,
                        executeMany=self.executeMany)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        else:
            raise Exception("unsupported store mode %s" % mode)
        return cacheFile
Esempio n. 28
0
    def testQueryDocumentation(self):
        '''
        test QueryDocumentation
        '''
        show = self.debug
        #show=True
        queries = [{
            "endpoint":
            "https://query.wikidata.org/sparql",
            "prefixes": [],
            "lang":
            "sparql",
            "name":
            "Nicknames",
            "description":
            "https://stackoverflow.com/questions/70206791/sparql-i-have-individual-with-multiple-values-for-single-object-property-how",
            "title":
            "Nick names of US Presidents",
            "query":
            """SELECT ?item ?itemLabel (GROUP_CONCAT(DISTINCT ?nickName; SEPARATOR=",") as ?nickNames)
WHERE 
{
  # president
  ?item wdt:P39 wd:Q11696.
  ?item wdt:P1449 ?nickName
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} GROUP BY ?item ?itemLabel"""
        }, {
            "endpoint":
            "https://query.wikidata.org/sparql",
            "prefixes": [
                "http://www.wikidata.org/entity/",
                "http://commons.wikimedia.org/wiki/Special:FilePath/"
            ],
            "lang":
            "sparql",
            "name":
            "CAS15",
            "title":
            "15 Random substances with CAS number",
            "description":
            "Wikidata SPARQL query showing the 15 random chemical substances with their CAS Number",
            "query":
            """# List of 15 random chemical components with CAS-Number, formula and structure
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
# WF 2021-08-23
SELECT ?substance ?substanceLabel ?formula ?structure ?CAS
WHERE { 
  ?substance wdt:P31 wd:Q11173.
  ?substance wdt:P231 ?CAS.
  ?substance wdt:P274 ?formula.
  ?substance wdt:P117  ?structure.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 15
"""
        }, {
            "endpoint":
            "https://query.wikidata.org/sparql",
            "prefixes": ["http://www.wikidata.org/entity/"],
            "lang":
            "sparql",
            "name":
            "CityTop10",
            "title":
            "Ten largest cities of the world",
            "description":
            "Wikidata SPARQL query showing the 10 most populated cities of the world using the million city class Q1637706 for selection",
            "query":
            """# Ten Largest cities of the world 
# WF 2021-08-23
# see also http://wiki.bitplan.com/index.php/PyLoDStorage#Examples
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
SELECT DISTINCT ?city ?cityLabel ?population ?country ?countryLabel 
WHERE {
  VALUES ?cityClass { wd:Q1637706}.
  ?city wdt:P31 ?cityClass .
  ?city wdt:P1082 ?population .
  ?city wdt:P17 ?country .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
ORDER BY DESC(?population)
LIMIT 10"""
        }, {
            "endpoint":
            "https://sophox.org/sparql",
            "lang":
            "sparql",
            "prefixes": [],
            "query":
            """# count osm place type instances
# WF 2021-08-23
# see also http://wiki.bitplan.com/index.php/PyLoDStorage#Examples
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
SELECT (count(?instance) as ?count) ?placeType ?placeTypeLabel
WHERE { 
  VALUES ?placeType {
    "city"
    "town"
    "village"
  }
  ?instance osmt:place ?placeType
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?placeType ?placeTypeLabel
ORDER BY ?count""",
            "name":
            "OSM place types",
            "title":
            "count OpenStreetMap place type instances",
            "description":
            """This SPARQL query 
determines the number of instances available in the OpenStreetMap for the placeTypes city,town and village
"""
        }]
        for queryMap in queries:
            endpointUrl = queryMap.pop("endpoint")
            endpoint = SPARQL(endpointUrl)
            query = Query(**queryMap)
            showYaml = False
            if showYaml:
                yamlMarkup = query.asYaml()
                print(yamlMarkup)
            try:
                qlod = endpoint.queryAsListOfDicts(query.query)
                for tablefmt in ["mediawiki", "github", "latex"]:
                    doc = query.documentQueryResult(qlod,
                                                    tablefmt=tablefmt,
                                                    floatfmt=".0f")
                    docstr = doc.asText()
                    if show:
                        print(docstr)

            except Exception as ex:
                print(f"{query.title} at {endpointUrl} failed: {ex}")
Esempio n. 29
0
 def getDBPedia(self, mode='query', debug=False):
     '''
     create a SPARQL access object for the DBpedia endpoint

     Args:
         mode: forwarded unchanged to the SPARQL constructor (default 'query')
         debug: forwarded unchanged to the SPARQL constructor (default False)

     Returns:
         the SPARQL object constructed for http://dbpedia.org/sparql
     '''
     return SPARQL("http://dbpedia.org/sparql", mode=mode, debug=debug)
Esempio n. 30
0
    def fromWikiData(self,endpoint):
        '''
        get the city List from WikiData

        Args:
            endpoint(string): the url of the endpoint to be used

        Returns:
            list: and sets it as self.cityList as a side effect
        '''
        wd=SPARQL(endpoint)
        queryString="""# get a list of cities
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
# get human settlements
SELECT DISTINCT ?city ?cityLabel (max(?cityPop) as ?cityPopulation) (min (?coord) as ?cityCoord) ?region ?regionLabel ?regionIsoCode ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGdpPerCapita WHERE {
  # if you uncomment this line this query might run for some 3 hours on a local wikidata copy using Apache Jena
  # run for Vienna, Illinois, Vienna Austria, Paris Texas and Paris France as example only
  # VALUES ?city { wd:Q577544 wd:Q1741 wd:Q830149 wd:Q90}.
  # run for Andorra Q228 
  # VALUES ?country {wd:Q228}.
  # instance of human settlement https://www.wikidata.org/wiki/Q486972
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  # label of the City
  ?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
  # country this city belongs to
  ?city wdt:P17 ?country .
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  OPTIONAL {
     ?country wdt:P2132 ?countryGdpPerCapita.
  }
  OPTIONAL {
     # located in administrative territory
     # https://www.wikidata.org/wiki/Property:P131
     ?city wdt:P131* ?region.
     # administrative unit of first order
     ?region wdt:P31/wdt:P279* wd:Q10864048.
     ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
     # isocode state/province
     OPTIONAL { ?region wdt:P300 ?regionIsoCode. }
  }
  # population of city
  OPTIONAL { ?city wdt:P1082 ?cityPop.}
   # get the coordinates
  OPTIONAL { ?city wdt:P625 ?coord. }
} GROUP BY  ?city ?cityLabel  ?cityCoord ?region ?regionLabel ?regionIsoCode ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGdpPerCapita
#ORDER BY ?cityLabel
"""
        results=wd.query(queryString)
        self.cityList=wd.asListOfDicts(results)
        for city in self.cityList:
            city['wikidataurl']=city.pop('city')
            city['name']=city.pop('cityLabel')
            super().setNone(city,['coord','date','cityPopulation','countryPopulation','country','countryLabel','countryIsoCode','countryGDP_perCapita','region','regionLabel','regionIsoCode','ratio'])
        return self.cityList