def query(self, msg, queryString: str, limit=None) -> list:
    '''
    run the given SPARQL query against my endpoint and post-process the records

    String values starting with "http://www.wikidata.org/" are replaced by
    the result of getWikidataId. String values whose key ends with "coord"
    (case-insensitive) are split into "lat" and "lon" entries and the
    original key is removed.

    Args:
        msg(str): the profile message to display
        queryString(str): the query to execute
        limit(int): if not None a " LIMIT <limit>" clause is appended to the query

    Return:
        list: the list of dicts with the result
    '''
    profile = Profiler(msg, profile=self.profile)
    sparql = SPARQL(self.endpoint)
    if limit is None:
        effectiveQuery = queryString
    else:
        effectiveQuery = f"{queryString} LIMIT {limit}"
    lod = sparql.asListOfDicts(sparql.query(effectiveQuery))
    for record in lod:
        # iterate over a snapshot of the keys since the record is modified
        for key in tuple(record):
            value = record[key]
            if not isinstance(value, str):
                continue
            if value.startswith("http://www.wikidata.org/"):
                record[key] = self.getWikidataId(value)
            if key.lower().endswith("coord"):
                lat, lon = Wikidata.getCoordinateComponents(value)
                record["lat"] = lat
                record["lon"] = lon
                record.pop(key)
    profile.time(f"({len(lod)})")
    return lod
def getCities(self, region=None, country=None):
    '''
    get the cities from Wikidata

    Args:
        region(str): Wikidata id (e.g. Q1198) of the region to restrict the
            query to - if None no region restriction is applied
        country(str): Wikidata id of the country to restrict the query to -
            if given it takes precedence over the region restriction

    Returns:
        list: the list of dicts with the city records
    '''
    # default to "no restriction" - without this initialisation a call with
    # neither region nor country failed with an UnboundLocalError below
    values = ""
    if region is not None:
        values = "VALUES ?region { wd:%s }" % region
    if country is not None:
        # a given country deliberately overrides the region clause
        values = "VALUES ?country { wd:%s}" % country
    queryString = """# get a list of cities for the given region
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT DISTINCT ?city ?cityLabel ?geoNameId ?cityPop ?cityCoord ?region ?regionLabel ?regionIsoCode ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGdpPerCapita
WHERE {
  # administrative unit of first order
  # example DE-NW Q1198
  %s
  #?region wdt:P31/wdt:P279* wd:Q10864048.
  ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
  # isocode state/province
  OPTIONAL { ?region wdt:P300 ?regionIsoCode. }
  # country this region belongs to
  ?region wdt:P17 ?country .
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  OPTIONAL {
     ?country wdt:P2132 ?countryGdpPerCapita.
  }
  # located in administrative territory
  # https://www.wikidata.org/wiki/Property:P131
  ?city wdt:P131* ?region.
  # label of the City
  ?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
  # instance of human settlement https://www.wikidata.org/wiki/Q486972
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  # geoName Identifier
  ?city wdt:P1566 ?geoNameId.
  # population of city
  OPTIONAL { ?city wdt:P1082 ?cityPop.}
  # get the coordinates
  OPTIONAL {
      select (max(?coord) as ?cityCoord) where {
        ?city wdt:P625 ?coord.
      }
  }
}
ORDER BY ?cityLabel""" % values
    wd = SPARQL(self.endpoint)
    results = wd.query(queryString)
    cityList = wd.asListOfDicts(results)
    return cityList
def fromWikiData(self, endpoint):
    '''
    get the country list from WikiData

    Each record gets its 'country' entry renamed to 'wikidataurl' and its
    'countryLabel' entry renamed to 'name'; 'shortName' and 'gdpPerCapita'
    are handed to setNone of the super class.

    Args:
        endpoint(string): the url of the endpoint to be used

    Returns:
        list: the country list - it is also set as self.countryList as a side effect
    '''
    sparql = SPARQL(endpoint)
    queryString = """
# get a list countries with the corresponding ISO code
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
SELECT ?country ?countryLabel ?shortName (MAX(?pop) as ?population) ?gdpPerCapita ?coord ?isocode
WHERE
{
  # instance of country
  ?country wdt:P31 wd:Q3624078.
  OPTIONAL {
     ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  }
  OPTIONAL {
      ?country p:P1813 ?shortNameStmt. # get the short name statement
      ?shortNameStmt ps:P1813 ?shortName # the the short name value from the statement
      filter (lang(?shortName) = "en") # filter for English short names only
      filter not exists {?shortNameStmt pq:P31 wd:Q28840786} # ignore flags (aka emojis)
  }
  OPTIONAL {
    # get the population
    # https://www.wikidata.org/wiki/Property:P1082
    ?country wdt:P1082 ?pop.
  }
  OPTIONAL {
    # get the gross domestic product per capita
    ?country wdt:P2132 ?gdpPerCapita.
  }
  # get the iso countryCode
  { ?country wdt:P297 ?isocode }.
  # get the coordinate
  OPTIONAL {
    ?country wdt:P625 ?coord
  }.
}
GROUP BY ?country ?countryLabel ?shortName ?population ?gdpPerCapita ?coord ?isocode
ORDER BY ?countryLabel"""
    self.countryList = sparql.asListOfDicts(sparql.query(queryString))
    optionalFields = ['shortName', 'gdpPerCapita']
    for record in self.countryList:
        # rename the SPARQL variable names to the attribute names used locally
        record['wikidataurl'] = record.pop('country')
        record['name'] = record.pop('countryLabel')
        super().setNone(record, optionalFields)
    return self.countryList
def test_query_with_authentication(self):
    """
    tests querying an endpoint that requires authentication

    The query must fail as unauthorized first and deliver two records
    once the credentials have been added.
    """
    query = """SELECT * WHERE { ?proceeding dblp:publishedInSeriesVolume "2816" .}"""
    sparql = SPARQL("http://localhost:5820/dblp/query", method="POST")
    # without credentials the endpoint is expected to reject the query
    with self.assertRaises(SPARQLExceptions.Unauthorized):
        sparql.queryAsListOfDicts(queryString=query)
    sparql.addAuthentication("admin", "admin")
    records = sparql.queryAsListOfDicts(query)
    self.assertEqual(2, len(records))
def fromRDF(self, endpoint):
    '''
    retrieve my event list from the given SPARQL endpoint

    Every record of the query result is converted to an Event, added to
    my event manager and finally the event manager is stored.

    Args:
        endpoint(str): the url of the SPARQL endpoint serving the GND data
    '''
    # get SPARQL access to GND data
    print(
        "Retrieving %s events from SPARQL endpoint %s\n ... this might take a few minutes ..."
        % (self.em.title, endpoint))
    started = time.time()
    gndEndpoint = SPARQL(endpoint)
    queryString = """# get events with most often used columns from GND
# plus acronym, topic, homepage (seldom but useful)
# WF 2020-07-12
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT  ?event ?eventId ?acronym  ?variant ?title ?date ?areaCode ?place ?topic ?homepage
WHERE {
  ?event a gnd:ConferenceOrEvent.
  ?event gnd:gndIdentifier ?eventId.
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant.}
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?title.}
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date. }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode. }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place. }
  OPTIONAL { ?event gnd:topic ?topic. }
  { ?event gnd:homepage ?homepage. }
}
#LIMIT 10000"""
    eventList = gndEndpoint.asListOfDicts(gndEndpoint.query(queryString))
    print("retrieved %d events in %6.1f s" %
          (len(eventList), time.time() - started))
    fieldNames = (
        'eventId', 'variant', 'name', 'areaCode', 'url', 'source', 'date',
        'startDate', 'endDate', 'year', 'place', 'acronym', 'lookupAcronym',
        'topic', 'homepage'
    )
    for rawevent in eventList:
        rawevent['url'] = rawevent.pop('event')
        # a fresh list per record, as the field list is handed over to setNone
        self.em.setNone(rawevent, list(fieldNames))
        # merge the date range entries derived from the date string
        dateRange = GND.getDateRange(rawevent['date'])
        for key, value in dateRange.items():
            rawevent[key] = value
        event = Event()
        event.fromDict(rawevent)
        event.source = self.em.name
        self.em.add(event)
    self.em.store(sampleRecordCount=10000)
def getCityPopulations(self, profile=True):
    '''
    get the city populations from Wikidata

    Args:
        profile(bool): if True show profiling information

    Returns:
        list: the list of dicts with the city records
    '''
    queryString = """
# get a list of human settlements having a geoName identifier
# to add to geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
SELECT ?city ?cityLabel ?cityPop ?geoNameId ?country ?countryLabel ?countryIsoCode ?countryPopulation
WHERE {
  # geoName Identifier
  ?city wdt:P1566 ?geoNameId.
  # instance of human settlement https://www.wikidata.org/wiki/Q486972
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  # population of city
  OPTIONAL { ?city wdt:P1082 ?cityPop.}
  # label of the City
  ?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
  # country this city belongs to
  ?city wdt:P17 ?country .
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  OPTIONAL {
     ?country wdt:P2132 ?countryGdpPerCapita.
  }
}"""
    if profile:
        print(
            "getting cities with population and geoNamesId from wikidata endpoint %s"
            % self.endpoint)
    started = time.time()
    sparql = SPARQL(self.endpoint)
    cityList = sparql.asListOfDicts(sparql.query(queryString))
    if profile:
        elapsed = time.time() - started
        print(f"Found {len(cityList):d} cities in {elapsed:5.1f} s")
    return cityList
def testGetItemsByLabel(self):
    '''
    try getting items by label on all configured endpoints
    '''
    debug = self.debug
    #debug=True
    qLabels = [
        "academic conference", "scientific conference series",
        "whisky distillery", "human"
    ]
    for endpointConf in self.endpointConfs:
        try:
            sparql = SPARQL(endpointConf.endpoint,
                            method=endpointConf.method)
            firstItemByLabel = {}
            for qLabel in qLabels:
                candidates = WikidataItem.getItemsByLabel(sparql, qLabel)
                if debug:
                    for pos, item in enumerate(candidates, start=1):
                        print(f"{endpointConf.name} {pos}:{item}")
                # only the first candidate is kept per label
                firstItemByLabel[qLabel] = candidates[0]
            for qLabel in qLabels:
                self.assertTrue(qLabel in firstItemByLabel)
        except (Exception, HTTPError) as ex:
            # any failure (including assertion failures raised above) is
            # delegated to the service availability handling
            self.handleServiceUnavailable(ex, endpointConf)
def getLabelAndDescription(cls, sparql: SPARQL, itemId: str, lang: str = "en"):
    '''
    get the label and description for the given item and language

    Args:
        sparql(SPARQL): the SPARQL endpoint to use
        itemId(str): the wikidata Q/P id
        lang(str): the language of the label

    Returns:
        (str,str): the label and description as a tuple
    '''
    prefixes = cls.getPrefixes(["rdfs","wd","schema"])
    sparqlQuery = f"""# get the label for the given item
{prefixes}
SELECT ?itemLabel ?itemDescription
WHERE
{{
  VALUES ?item {{
    wd:{itemId}
  }}
  ?item rdfs:label ?itemLabel.
  FILTER (LANG(?itemLabel) = "{lang}").
  ?item schema:description ?itemDescription.
  FILTER(LANG(?itemDescription) = "{lang}")
}}"""
    labelAndDescription = sparql.getValues(sparqlQuery,
                                           ["itemLabel", "itemDescription"])
    return labelAndDescription
def testControlEscape(self):
    '''
    check the control-escaped version of an UTF-8 string:
    tab, carriage return and newline must show up as backslash sequences
    '''
    escaped = SPARQL.controlEscape("Α\tΩ\r\n")
    # raw string: the backslashes are expected literally in the result
    self.assertEqual(r"Α\tΩ\r\n", escaped)
def testStackoverflow55961615Query(self):
    '''
    run an EntitySearch query and document the result in several table formats

    see
    https://stackoverflow.com/questions/55961615/how-to-integrate-wikidata-query-in-python
    https://stackoverflow.com/a/69771615/1497139
    '''
    qlod = None
    try:
        endpoint = "https://query.wikidata.org/sparql"
        wd = SPARQL(endpoint)
        queryString = """SELECT ?s ?sLabel ?item ?itemLabel ?sourceCode ?webSite ?stackexchangeTag  {
    SERVICE wikibase:mwapi {
        bd:serviceParam wikibase:api "EntitySearch".
        bd:serviceParam wikibase:endpoint "www.wikidata.org".
        bd:serviceParam mwapi:search "natural language processing".
        bd:serviceParam mwapi:language "en".
        ?item wikibase:apiOutputItem mwapi:item.
        ?num wikibase:apiOrdinal true.
    }
    ?s wdt:P279|wdt:P31 ?item .
    OPTIONAL {
      ?s wdt:P1324 ?sourceCode.
    }
    OPTIONAL {
      ?s wdt:P856 ?webSite.
    }
    OPTIONAL {
      ?s wdt:P1482 ?stackexchangeTag.
    }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
}
ORDER BY ?itemLabel ?sLabel"""
        qlod = wd.queryAsListOfDicts(queryString, fixNone=True)
    except Exception as ex:
        print(f"{endpoint} access failed with {ex}- could not run test")
    # nothing to document if the endpoint could not be queried
    if qlod is None:
        return
    query = Query(name="EntitySearch", query=queryString, lang='sparql')
    showDoc = self.debug
    for tablefmt in ("github", "mediawiki", "latex"):
        qdoc = query.documentQueryResult(qlod, tablefmt=tablefmt)
        if showDoc:
            print(qdoc)
def testIssue7(self):
    '''
    test conversion of dates with timezone info:
    a regular date is converted while a five digit year is expected to yield None
    '''
    cases = [
        ("2020-01-01T00:00:00Z", datetime.datetime(2020, 1, 1, 0, 0)),
        ("42000-01-01T00:00:00Z", None),
    ]
    for dateStr, expectedDt in cases:
        converted = SPARQL.strToDatetime(dateStr, debug=self.debug)
        self.assertEqual(expectedDt, converted)
def fromWikiData(self, endpoint):
    '''
    get the province list from WikiData

    Each record gets its 'region' entry renamed to 'wikidataurl' and its
    'regionLabel' entry renamed to 'name'; 'population' and 'location'
    are handed to setNone of the super class.

    Args:
        endpoint(string): the url of the endpoint to be used

    Returns:
        list: the province list - it is also set as self.provinceList as a side effect
    '''
    sparql = SPARQL(endpoint)
    queryString = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT ?region ?isocc ?isocode4 ?regionLabel ?population ?location
WHERE
{
  # administrative unit of first order
  ?region wdt:P31/wdt:P279* wd:Q10864048.
  OPTIONAL {
     ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
  }
  # filter historic regions
  # FILTER NOT EXISTS {?region wdt:P576 ?end}
  # get the population
  # https://www.wikidata.org/wiki/Property:P1082
  OPTIONAL { ?region wdt:P1082 ?population. }
  # # https://www.wikidata.org/wiki/Property:P297
  OPTIONAL { ?region wdt:P297 ?isocc. }
  # isocode state/province
  ?region wdt:P300 ?isocode4.
  # https://www.wikidata.org/wiki/Property:P625
  OPTIONAL { ?region wdt:P625 ?location. }
}
ORDER BY (?isocode4)
"""
    self.provinceList = sparql.asListOfDicts(sparql.query(queryString))
    optionalFields = ['population', 'location']
    for record in self.provinceList:
        # rename the SPARQL variable names to the attribute names used locally
        record['wikidataurl'] = record.pop('region')
        record['name'] = record.pop('regionLabel')
        super().setNone(record, optionalFields)
    return self.provinceList
def testStats(self):
    '''
    print wiki markup statistics on the usage frequency of entity types
    and properties - does nothing if the endpoint is not available
    '''
    if not self.available():
        return
    entityUsage = Query(
        'entities and usage frequency', '''
# get histogramm data of entities by
# usage frequency
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT ?c  (COUNT(?c) AS ?count)
WHERE {
  ?subject a  ?c
}
GROUP BY ?c
HAVING (?count >100)
ORDER BY DESC(?count)
''')
    fieldRelevance = Query(
        'relevance of fields', '''# get histogramm data of properties by
# usage frequency
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
HAVING (?propTotal >1000)
ORDER BY DESC(?propTotal)''')
    sparql = SPARQL(self.endpoint)
    for query in (entityUsage, fieldRelevance):
        records = sparql.queryAsListOfDicts(query.query)
        # shorten the gnd namespace url to its prefix for readability
        markup = query.asWikiMarkup(records).replace(
            "https://d-nb.info/standards/elementset/gnd", "gnd")
        print("=== %s ===" % query.name)
        print(markup)
def testStackoverflow71444069(self):
    '''
    query Wikidata and convert the result to CSV

    see
    https://stackoverflow.com/questions/71444069/create-csv-from-result-of-a-for-google-colab/71548650#71548650
    '''
    # the imports are kept local to mirror the stand-alone stackoverflow answer
    from lodstorage.sparql import SPARQL
    from lodstorage.csv import CSV
    sparqlQuery = """SELECT ?org ?orgLabel
WHERE
{
  ?org wdt:P31 wd:Q4830453. #instance of organizations
  ?org wdt:P17 wd:Q96. #Mexico country

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en"}
}"""
    endpoint = SPARQL("https://query.wikidata.org/sparql")
    csvText = CSV.toCSV(endpoint.queryAsListOfDicts(sparqlQuery))
    if self.debug:
        print(csvText)
def __init__(self, name, entityName, entityPluralName, config=None, debug=False):
    '''
    Constructor

    Sets up the storage access (Dgraph, SPARQL or SQL) selected by config.mode.

    Args:
        name(string): name of this manager
        entityName(string): entityType to be managed e.g. Country
        entityPluralName(string): plural of the the entityType e.g. Countries
        config(StorageConfig): the configuration to be used if None a default configuration will be used
        debug(boolean): override debug setting when default of config is used via config=None

    Raises:
        Exception: if the mode is SPARQL and the configuration has no endpoint
    '''
    self.name = name
    self.entityName = entityName
    self.entityPluralName = entityPluralName
    # NOTE(review): the nesting of the tableName/debug overrides below the
    # "config is None" check is assumed from the docstring of the debug
    # parameter - confirm against the original layout
    if config is None:
        config = StorageConfig.getDefault()
        if config.tableName is None:
            # by default the table is named after the entity
            config.tableName = entityName
        if debug:
            config.debug = debug
    self.config = config
    cacheFile = self.getCacheFile(config=config, mode=config.mode)
    self.showProgress("Creating %smanager(%s) for %s using cache %s" %
                      (self.entityName, config.mode, self.name, cacheFile))
    # prepare the access object needed for the configured store mode
    if config.mode is StoreMode.DGRAPH:
        self.dgraph = Dgraph(debug=config.debug,
                             host=config.host,
                             profile=config.profile)
    elif config.mode is StoreMode.SPARQL:
        if config.endpoint is None:
            raise Exception("no endpoint set for mode sparql")
        self.endpoint = config.endpoint
        self.sparql = SPARQL(config.endpoint,
                             debug=config.debug,
                             profile=config.profile)
    elif config.mode is StoreMode.SQL:
        self.executeMany = False  # may be True when issues are fixed
def getRegions(self):
    '''
    get Regions from Wikidata and store them as self.regionList

    The link below encodes an earlier, non-aggregating version of the query
    (it selects ?regionIsoCode and ?population directly).

    `try query <https://query.wikidata.org/#%23%20get%20a%20list%20of%20regions%0A%23%20for%20geograpy3%20library%0A%23%20see%20https%3A%2F%2Fgithub.com%2Fsomnathrakshit%2Fgeograpy3%2Fissues%2F15%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0ASELECT%20%3Fcountry%20%3FcountryLabel%20%3FcountryIsoCode%20%3Fregion%20%3FregionIsoCode%20%3FregionLabel%20%3Fpopulation%20%3Flocation%0AWHERE%0A%7B%0A%20%20%23%20administrative%20unit%20of%20first%20order%0A%20%20%3Fregion%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ10864048.%0A%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%3Fregion%20rdfs%3Alabel%20%3FregionLabel%20filter%20%28lang%28%3FregionLabel%29%20%3D%20%22en%22%29.%0A%20%20%7D%0A%20%20%23%20filter%20historic%20regions%0A%20%20%23%20FILTER%20NOT%20EXISTS%20%7B%3Fregion%20wdt%3AP576%20%3Fend%7D%0A%20%20%23%20get%20the%20population%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP1082%0A%20%20OPTIONAL%20%7B%20%3Fregion%20wdt%3AP1082%20%3Fpopulation.%20%7D%0A%20%20%23%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP297%0A%20%20OPTIONAL%20%7B%20%0A%20%20%20%20%3Fregion%20wdt%3AP17%20%3Fcountry.%0A%20%20%20%20%23%20label%20for%20the%20country%0A%20%20%20%20%3Fcountry%20rdfs%3Alabel%20%3FcountryLabel%20filter%20%28lang%28%3FcountryLabel%29%20%3D%20%22en%22%29.%0A%20%20%20%20%3Fcountry%20wdt%3AP297%20%3FcountryIsoCode.%20%0A%20%20%7D%0A%20%20%23%20isocode%20state%2Fprovince%0A%20%20%3Fregion%20wdt%3AP300%20%3FregionIsoCode.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP625%0A%20%20OPTIONAL%20%7B%20%3Fregion%20wdt%3AP625%20%3Flocation.%20%7D%0A%7D>`_
    '''
    queryString = """# get a list of regions
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT DISTINCT ?country ?countryLabel ?countryIsoCode ?region (max(?regionAlpha2) as ?regionIsoCode) ?regionLabel (max(?population) as ?regionPopulation) ?location
WHERE
{
  # administrative unit of first order
  ?region wdt:P31/wdt:P279* wd:Q10864048.
  OPTIONAL {
     ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
  }
  # filter historic regions
  # FILTER NOT EXISTS {?region wdt:P576 ?end}
  # get the population
  # https://www.wikidata.org/wiki/Property:P1082
  OPTIONAL { ?region wdt:P1082 ?population. }
  # # https://www.wikidata.org/wiki/Property:P297
  OPTIONAL {
    ?region wdt:P17 ?country.
    # label for the country
    ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
    ?country wdt:P297 ?countryIsoCode.
  }
  # isocode state/province
  ?region wdt:P300 ?regionAlpha2.
  # https://www.wikidata.org/wiki/Property:P625
  OPTIONAL { ?region wdt:P625 ?location. }
}
GROUP BY ?country ?countryLabel ?countryIsoCode ?region ?regionIsoCode ?regionLabel ?location
ORDER BY ?regionIsoCode"""
    sparql = SPARQL(self.endpoint)
    rawResults = sparql.query(queryString)
    self.regionList = sparql.asListOfDicts(rawResults)
def test_SPARQL(self):
    '''
    test SPARQL queries - currently disabled by the early return
    '''
    # disable test for the time being
    return
    # everything below is intentionally unreachable while the test is disabled
    queryManager = QueryManager(lang='sparql', debug=False)
    self.assertEqual(4, len(queryManager.queriesByName))
    sparql = SPARQL("http://localhost:3030/cr")
    for name, query in queryManager.queriesByName.items():
        records = sparql.queryAsListOfDicts(query.query)
        markup = query.asWikiMarkup(records).replace(
            "http://cr.bitplan.com/",
            "https://cr.bitplan.com/index.php/Property:")
        print("== %s ==" % (name))
        print("=== query ===")
        print(query.asWikiSourceMarkup())
        print("=== result ===")
        print(markup)
def getCountries(self):
    '''
    get a list of countries from Wikidata and store it as self.countryList

    The link below encodes an earlier version of the query (it selects
    instances of wd:Q3624078 and requires coordinates and GDP per capita).

    `try query <https://query.wikidata.org/#%23%20get%20a%20list%20of%20countries%0A%23%20for%20geograpy3%20library%0A%23%20see%20https%3A%2F%2Fgithub.com%2Fsomnathrakshit%2Fgeograpy3%2Fissues%2F15%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20p%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2F%3E%0APREFIX%20ps%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fstatement%2F%3E%0APREFIX%20pq%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fqualifier%2F%3E%0A%23%20get%20City%20details%20with%20Country%0ASELECT%20DISTINCT%20%3Fcountry%20%3FcountryLabel%20%3FcountryIsoCode%20%3FcountryPopulation%20%3FcountryGDP_perCapita%20%3Fcoord%20%20WHERE%20%7B%0A%20%20%23%20instance%20of%20City%20Country%0A%20%20%3Fcountry%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ3624078%20.%0A%20%20%23%20label%20for%20the%20country%0A%20%20%3Fcountry%20rdfs%3Alabel%20%3FcountryLabel%20filter%20%28lang%28%3FcountryLabel%29%20%3D%20%22en%22%29.%0A%20%20%23%20get%20the%20coordinates%0A%20%20%3Fcountry%20wdt%3AP625%20%3Fcoord.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP297%20ISO%203166-1%20alpha-2%20code%0A%20%20%3Fcountry%20wdt%3AP297%20%3FcountryIsoCode.%0A%20%20%23%20population%20of%20country%0A%20%20%3Fcountry%20wdt%3AP1082%20%3FcountryPopulation.%0A%20%20%23%20https%3A%2F%2Fwww.wikidata.org%2Fwiki%2FProperty%3AP2132%0A%20%20%23%20nonminal%20GDP%20per%20capita%0A%20%20%3Fcountry%20wdt%3AP2132%20%3FcountryGDP_perCapita.%0A%7D>`_
    '''
    queryString = """# get a list of countries
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
# get City details with Country
SELECT DISTINCT ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGDP_perCapita ?countryCoord  WHERE {
  # instance of Country
  ?country wdt:P31/wdt:P279* wd:Q6256 .
  # VALUES ?country { wd:Q55}.
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # get the coordinates
  OPTIONAL {
      select (max(?coord) as ?countryCoord) where {
        ?country wdt:P625 ?coord.
      }
  }
  # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  # https://www.wikidata.org/wiki/Property:P2132
  # nominal GDP per capita
  OPTIONAL { ?country wdt:P2132 ?countryGDP_perCapita. }
}
ORDER BY ?countryIsoCode"""
    sparql = SPARQL(self.endpoint)
    rawResults = sparql.query(queryString)
    self.countryList = sparql.asListOfDicts(rawResults)
def __init__(self,
             itemQid,
             propertyLabels: list = None,
             propertyIds: list = None,
             subclassPredicate="wdt:P31",
             where: str = None,
             endpointConf=None,
             lang="en",
             debug=False):
    '''
    Constructor

    Args:
        itemQid(str): wikidata id of the type to analyze
        propertyLabels(list): a list of labels of properties to be considered - None is treated as an empty list
        propertyIds(list): a list of ids of properties to be considered - None is treated as an empty list
        subclassPredicate(str): the subclass Predicate to use
        where(str): extra where clause for instance selection (if any)
        endpointConf(Endpoint): the configuration of the SPARQL endpoint to be used - if None the default endpoint is used
        lang(str): the language to use for labels
        debug(bool): if True switch on debugging
    '''
    # avoid mutable default arguments: a shared default list would be
    # visible to all instances if any callee ever modified it
    if propertyLabels is None:
        propertyLabels = []
    if propertyIds is None:
        propertyIds = []
    self.itemQid = itemQid
    self.debug = debug
    if endpointConf is None:
        endpointConf = Endpoint.getDefault()
    self.endpointConf = endpointConf
    self.sparql = SPARQL(endpointConf.endpoint,
                         method=self.endpointConf.method)
    self.sparql.debug = self.debug
    self.subclassPredicate = subclassPredicate
    # the extra where clause is put on a line of its own within the queries
    self.where = f"\n {where}" if where is not None else ""
    self.lang = lang
    self.item = WikidataItem(itemQid, sparql=self.sparql, lang=lang)
    self.queryManager = TrulyTabular.getQueryManager(debug=self.debug)
    # collect the properties selected by id and by label
    self.properties = WikidataProperty.getPropertiesByIds(
        self.sparql, propertyIds, lang)
    self.properties.update(
        WikidataProperty.getPropertiesByLabels(self.sparql, propertyLabels,
                                               lang))
    self.isodate = datetime.datetime.now().isoformat()
    self.error = None
def testIssue20And76(self):
    '''
    check that with fixNone every field is available in every record
    for both the POST and the GET method

    see https://github.com/WolfgangFahl/pyLoDStorage/issues/20
    add fixNone option to SPARQL results (same functionality as in SQL)

    https://github.com/WolfgangFahl/pyLoDStorage/issues/76
    SPARQL GET method support
    '''
    endpoint = "https://query.wikidata.org/sparql"
    queryString = """
# Conference Series wikidata query
# see https://confident.dbis.rwth-aachen.de/dblpconf/wikidata
# WF 2021-01-30
SELECT ?confSeries ?short_name ?official_website
WHERE
{
  #  scientific conference series (Q47258130)
  ?confSeries wdt:P31 wd:Q47258130.
  OPTIONAL {
    ?confSeries wdt:P1813 ?short_name .
  }
  #  official website (P856)
  OPTIONAL {
    ?confSeries wdt:P856 ?official_website
  }
}
LIMIT 200
"""
    for method in ("POST", "GET"):
        wd = SPARQL(endpoint, method=method)
        lod = wd.queryAsListOfDicts(queryString, fixNone=True)
        fields = LOD.getFields(lod)
        if self.debug:
            print(fields)
        # with fixNone each record has to contain all fields
        for row in lod:
            for field in fields:
                self.assertTrue(field in row)
def testWikdata(self):
    '''
    check wikidata by querying the whisky distilleries having a coordinate
    '''
    # check we have local wikidata copy:
    #if getpass.getuser()=="wf":
    #    # use 2018 wikidata copy
    #    endpoint="http://jena.zeus.bitplan.com/wikidata/"
    wd = SPARQL("https://query.wikidata.org/sparql")
    queryString = """# get a list of whisky distilleries
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?item ?coord
WHERE
{
  # instance of whisky distillery
  ?item wdt:P31 wd:Q10373548.
  # get the coordinate
  ?item wdt:P625 ?coord.
}
"""
    results = wd.query(queryString)
    # at least 238 results are expected
    self.assertTrue(len(results) >= 238)
def testSparqlQueries(self):
    '''
    test the selected SPARQL queries of the query manager by documenting
    their results in several table formats
    '''
    show = self.debug
    #show=True
    selectedNames = ["US President Nicknames"]
    qm = QueryManager(lang='sparql', debug=False)
    for name, query in qm.queriesByName.items():
        # only the selected queries are run
        if name not in selectedNames:
            continue
        if show:
            print(f"{name}:{query}")
        endpoint = SPARQL(query.endpoint)
        try:
            qlod = endpoint.queryAsListOfDicts(query.query)
            for tablefmt in ("mediawiki", "github", "latex"):
                doc = query.documentQueryResult(qlod,
                                                tablefmt=tablefmt,
                                                floatfmt=".0f")
                docstr = doc.asText()
                if show:
                    print(docstr)
        except Exception as ex:
            # failures are only reported and do not fail the test
            print(f"{query.title} at {query.endpoint} failed: {ex}")
def getItemsByLabel(cls, sparql: SPARQL, itemLabel: str, lang: str = "en") -> list:
    '''
    get the Wikidata items having the given label

    Args:
        sparql(SPARQL): the SPARQL endpoint to use
        itemLabel(str): the label of the items
        lang(str): the language of the label

    Returns:
        list: the potential items sorted by their qnumber
    '''
    valuesClause = f' "{itemLabel}"@{lang}\n'
    prefixes = cls.getPrefixes(["rdfs","schema","xsd"])
    query = f"""# get the items that have the given label in the given language
# e.g. we'll find human=Q5 as the oldest type for the label "human" first
# and then the newer ones such as "race in Warcraft"
{prefixes}
SELECT
  #?itemId
  ?item ?itemLabel ?itemDescription
WHERE {{
  VALUES ?itemLabel {{
    {valuesClause}
  }}
  #BIND (xsd:integer(SUBSTR(STR(?item),33)) AS ?itemId)
  ?item rdfs:label ?itemLabel.
  ?item schema:description ?itemDescription. FILTER(LANG(?itemDescription)="{lang}")
}}
#ORDER BY ?itemId"""
    candidates = []
    for record in sparql.queryAsListOfDicts(query):
        itemUrl = record["item"]
        # strip the entity url prefix to get the plain id
        qid = re.sub(r"http://www.wikidata.org/entity/(.*)", r"\1", itemUrl)
        candidate = WikidataItem(qid)
        candidate.url = itemUrl
        candidate.qlabel = record["itemLabel"]
        candidate.varname = Variable.validVarName(candidate.qlabel)
        candidate.description = record["itemDescription"]
        candidates.append(candidate)
    # lowest qnumber first
    candidates.sort(key=lambda candidate: candidate.qnumber)
    return candidates
def getJena(self, mode='query', debug=False, typedLiterals=False, profile=False):
    '''
    get the jena endpoint at http://localhost:3030/example for the given mode

    Args:
        mode(string): query or update
        debug(boolean): True if debug information should be output
        typedLiterals(boolean): True if INSERT DATA SPARQL commands should use typed literals
        profile(boolean): True if profile/timing information should be shown

    Returns:
        SPARQL: the endpoint access object
    '''
    return SPARQL("http://localhost:3030/example",
                  mode=mode,
                  debug=debug,
                  typedLiterals=typedLiterals,
                  profile=profile)
def testGetPropertiesById(self):
    '''
    try getting properties by id on all configured endpoints
    '''
    debug = self.debug
    #debug=True
    propertyIds = ["P1800"]
    expected = ["Wikimedia database name"]
    for endpointConf in self.endpointConfs:
        try:
            sparql = SPARQL(endpointConf.endpoint,
                            method=endpointConf.method)
            propList = WikidataProperty.getPropertiesByIds(sparql,
                                                           propertyIds,
                                                           lang="en")
            for index, prop in enumerate(propList):
                if debug:
                    print(f"{endpointConf.name} {index}:{prop}")
                self.assertEqual(prop, expected[index])
        except (Exception, HTTPError) as ex:
            # any failure (including assertion failures raised above) is
            # delegated to the service availability handling
            self.handleServiceUnavailable(ex, endpointConf)
class TrulyTabular(object):
    '''
    truly tabular SPARQL/RDF analysis

    checks "how tabular" a query based on a list of properties of an itemclass is
    '''

    def __init__(self, itemQid, propertyLabels: list = None, propertyIds: list = None,
                 subclassPredicate="wdt:P31", where: str = None, endpointConf=None,
                 lang="en", debug=False):
        '''
        Constructor

        Args:
            itemQid(str): wikidata id of the type to analyze
            propertyLabels(list): a list of labels of properties to be considered (None: no labels)
            propertyIds(list): a list of ids of properties to be considered (None: no ids)
            subclassPredicate(str): the subclass Predicate to use
            where(str): extra where clause for instance selection (if any)
            endpointConf: the endpoint configuration to use - if None Endpoint.getDefault() is used
            lang(str): the language to use for labels
            debug(bool): if True switch on debugging
        '''
        # None replaces the former mutable [] defaults which would have been
        # shared between all calls
        propertyLabels = [] if propertyLabels is None else propertyLabels
        propertyIds = [] if propertyIds is None else propertyIds
        self.itemQid = itemQid
        self.debug = debug
        if endpointConf is None:
            endpointConf = Endpoint.getDefault()
        self.endpointConf = endpointConf
        self.sparql = SPARQL(endpointConf.endpoint, method=self.endpointConf.method)
        self.sparql.debug = self.debug
        self.subclassPredicate = subclassPredicate
        # the extra where clause is appended on a line of its own
        self.where = f"\n  {where}" if where is not None else ""
        self.lang = lang
        self.item = WikidataItem(itemQid, sparql=self.sparql, lang=lang)
        self.queryManager = TrulyTabular.getQueryManager(debug=self.debug)
        # properties selected by id first, then merged with the ones selected by label
        self.properties = WikidataProperty.getPropertiesByIds(
            self.sparql, propertyIds, lang)
        self.properties.update(
            WikidataProperty.getPropertiesByLabels(self.sparql, propertyLabels, lang))
        self.isodate = datetime.datetime.now().isoformat()
        self.error = None

    def __str__(self):
        '''
        Returns:
            str: my text representation
        '''
        return self.asText(long=False)

    def count(self):
        '''
        get my count

        Returns:
            tuple: the number of items (None if the query failed - the
            exception is then kept in self.error) and the SPARQL query used
        '''
        itemText = self.getItemText()
        query = f"""# Count all items with the given type
# {itemText}
{WikidataItem.getPrefixes()}
SELECT (COUNT (DISTINCT ?item) AS ?count)
WHERE
{{
  # instance of {self.item.qlabel}
  ?item {self.subclassPredicate} wd:{self.item.qid}.{self.where}
}}"""
        try:
            count = self.sparql.getValue(query, "count")
            # workaround https://github.com/ad-freiburg/qlever/issues/717
            count = int(count)
        except Exception as ex:
            # best effort: remember the problem and signal it with a None count
            self.error = ex
            count = None
        return count, query

    def asText(self, long: bool = True):
        '''
        returns my content as a text representation

        Args:
            long(bool): True if a long format including url is wished

        Returns:
            str: a text representation of my content
        '''
        text = self.item.asText(long)
        return text

    def getItemText(self):
        '''
        get the short text identifying my item

        Returns:
            str: qid and label of my item separated by a colon
        '''
        # leads to 405 Method not allowed in SPARQLWrapper under certain circumstances
        # itemText=self.asText(long=True)
        itemText = f"{self.itemQid}:{self.item.qlabel}"
        return itemText

    @classmethod
    def getQueryManager(cls, lang='sparql', name="trulytabular", debug=False):
        '''
        get the query manager for the given language and fileName

        Args:
            lang(str): the language of the queries to extract
            name(str): the name of the manager containing the query specifications
            debug(bool): if True set debugging on

        Returns:
            QueryManager: the manager for the first existing yaml file - None if there is no such file
        '''
        qYamlFileName = f"{name}.yaml"
        for qYamlFile in YamlPath.getPaths(qYamlFileName):
            if os.path.isfile(qYamlFile):
                qm = QueryManager(lang=lang, debug=debug, queriesPath=qYamlFile)
                return qm
        return None

    def generateSparqlQuery(self, genMap: dict, listSeparator: str = "⇹",
                            naive: bool = True, lang: str = 'en') -> str:
        '''
        generate a SPARQL Query

        Args:
            genMap(dict): a dictionary of generation items aggregates/ignores/labels
            listSeparator(str): the symbole to use as a list separator for GROUP_CONCAT
            naive(bool): if True - generate a naive straight forward SPARQL query
                if False generate a proper truly tabular aggregate query
            lang(str): the language to generate for

        Returns:
            str: the generated SPARQL Query
        '''
        # The Wikidata item to generate the query for
        item = self.item
        # the name of this script
        script = Path(__file__).name
        # the mode of generation
        naiveText = "naive" if naive else "aggregate"
        # start with th preamble and PREFIX section
        # select the item and it's label
        sparqlQuery = f"""# truly tabular {naiveText} query for
# {item.qid}:{item.qlabel}
# generated by {script} version {Version.version} on {self.isodate}
{WikidataItem.getPrefixes()}
SELECT
  ?{item.itemVarname}
  ?{item.labelVarname}"""
        # loop over all properties
        for wdProp in self.properties.values():
            if naive:
                sparqlQuery += f"\n  ?{wdProp.valueVarname}"
            else:
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    for aggregate in genList:
                        if aggregate not in ["ignore", "label"]:
                            # a real aggregate function such as count, list, min, max ...
                            distinct = ""
                            if aggregate == "list":
                                aggregateFunc = "GROUP_CONCAT"
                                aggregateParam = f';SEPARATOR="{listSeparator}"'
                                distinct = "DISTINCT "
                            else:
                                if aggregate == "count":
                                    distinct = "DISTINCT "
                                aggregateFunc = aggregate.upper()
                                aggregateParam = ""
                            sparqlQuery += f"\n  ({aggregateFunc} ({distinct}?{wdProp.valueVarname}{aggregateParam}) AS ?{wdProp.valueVarname}_{aggregate})"
                        elif aggregate == "label":
                            sparqlQuery += f"\n  ?{wdProp.labelVarname}"
                        elif aggregate == "ignore" and "label" not in genList:
                            sparqlQuery += f"\n  ?{wdProp.valueVarname}"
        sparqlQuery += f"""
WHERE {{
  # instanceof {item.qid}:{item.qlabel}
  ?{item.itemVarname} {self.subclassPredicate} wd:{item.qid}.
  # label
  ?{item.itemVarname} rdfs:label ?{item.labelVarname}.
  FILTER (LANG(?{item.labelVarname}) = "{lang}").
"""
        for wdProp in self.properties.values():
            sparqlQuery += f"""  # {wdProp}
  OPTIONAL {{
    ?{item.itemVarname} wdt:{wdProp.pid} ?{wdProp.valueVarname}.
"""
            if wdProp.pid in genMap:
                genList = genMap[wdProp.pid]
                if "label" in genList:
                    sparqlQuery += f"""\n    ?{wdProp.valueVarname} rdfs:label ?{wdProp.labelVarname}."""
                    sparqlQuery += f"""\n    FILTER (LANG(?{wdProp.labelVarname}) = "{lang}")."""
            sparqlQuery += "\n  }\n"
        # close where Clause
        sparqlQuery += """}\n"""
        # optionally add Aggregate
        if not naive:
            sparqlQuery += f"""GROUP BY
  ?{item.itemVarname}
  ?{item.labelVarname}
"""
            for wdProp in self.properties.values():
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    if "label" in genList:
                        sparqlQuery += f"\n  ?{wdProp.labelVarname}"
                    if "ignore" in genList and "label" not in genList:
                        sparqlQuery += f"\n  ?{wdProp.valueVarname}"
            # every ignored property adds a HAVING condition that
            # allows at most one value
            havingCount = 0
            havingDelim = "   "
            for wdProp in self.properties.values():
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    if "ignore" in genList:
                        havingCount += 1
                        if havingCount == 1:
                            sparqlQuery += "\nHAVING ("
                        sparqlQuery += f"\n  {havingDelim}COUNT(?{wdProp.valueVarname})<=1"
                        havingDelim = "&& "
            if havingCount > 0:
                sparqlQuery += "\n)"
        return sparqlQuery

    def mostFrequentPropertiesQuery(self, whereClause: str = None, minCount: int = 0):
        '''
        get the query for the most frequently used properties

        Args:
            whereClause(str): the where clause selecting the items - if None
                a clause selecting the instances of my item is generated
            minCount(int): if greater than zero only properties with a higher count are selected

        Returns:
            Query: the query for the most frequently used properties
        '''
        if whereClause is None:
            whereClause = f"?item {self.subclassPredicate} wd:{self.itemQid}"
            # qlever gets the predicates via ql:has-predicate (see below)
            if self.endpointConf.database != "qlever":
                whereClause += ";?p ?id"
            whereClause += "."
        minCountFilter = ""
        if minCount > 0:
            minCountFilter = f"\n  FILTER(?count >{minCount})."
        itemText = self.getItemText()
        sparqlQuery = f"""# get the most frequently used properties for
# {itemText}
{WikidataItem.getPrefixes()}
SELECT ?prop ?propLabel ?wbType ?count WHERE {{
  {{"""
        if self.endpointConf.database == "qlever":
            sparqlQuery += """
    SELECT ?p (COUNT(DISTINCT ?item) AS ?count) WHERE {"""
        else:
            sparqlQuery += """
    SELECT ?prop (COUNT(DISTINCT ?item) AS ?count) WHERE {"""
        if self.endpointConf.database == "blazegraph":
            sparqlQuery += """
      hint:Query hint:optimizer "None"."""
        sparqlQuery += f"""
      {whereClause}"""
        if self.endpointConf.database == "qlever":
            sparqlQuery += """
      ?item ql:has-predicate ?p
    } GROUP BY ?p
  }
  ?prop wikibase:directClaim ?p."""
        else:
            sparqlQuery += """
      ?prop wikibase:directClaim ?p.
    } GROUP BY ?prop ?propLabel
  }"""
        sparqlQuery += f"""
  ?prop rdfs:label ?propLabel.
  ?prop wikibase:propertyType ?wbType.
  FILTER(LANG(?propLabel) = "{self.lang}").{minCountFilter}
}}
ORDER BY DESC (?count)
"""
        title = f"most frequently used properties for {self.item.asText(long=True)}"
        query = Query(name=f"mostFrequentProperties for {itemText}", query=sparqlQuery, title=title)
        return query

    def noneTabularQuery(self, wdProperty: "WikidataProperty", asFrequency: bool = True):
        '''
        get the query for the none tabular entries of the given property

        Args:
            wdProperty(WikidataProperty): the property to analyze
            asFrequency(bool): if true do a frequency analysis, else
                select the records having more than one value

        Returns:
            Query: the query for the none tabular analysis
        '''
        propertyLabel = wdProperty.plabel
        propertyId = wdProperty.pid
        # work around https://github.com/RDFLib/sparqlwrapper/issues/211
        if "described at" in propertyLabel:
            propertyLabel = propertyLabel.replace("described at", "describ'd at")
        sparql = f"""SELECT ?item ?itemLabel (COUNT (?value) AS ?count)
WHERE {{
  # instance of {self.item.qlabel}
  ?item {self.subclassPredicate} wd:{self.itemQid}.{self.where}
  ?item rdfs:label ?itemLabel.
  FILTER (LANG(?itemLabel) = "{self.lang}").
  # {propertyLabel}
  ?item {wdProperty.getPredicate()} ?value.
}} GROUP BY ?item ?itemLabel
"""
        if asFrequency:
            freqDesc = "frequencies"
            # the quadruple braces render as "{{" / "}}" - the per item
            # query becomes a nested subquery
            sparql = f"""SELECT ?count (COUNT(?count) AS ?frequency) WHERE {{{{
{sparql}
}}}}
GROUP BY ?count
ORDER BY DESC (?frequency)"""
        else:
            freqDesc = "records"
            sparql = f"""{sparql}
HAVING (COUNT (?value) > 1)
ORDER BY DESC(?count)"""
        itemText = self.getItemText()
        sparql = f"""# Count all {itemText} items
# with the given {propertyLabel}({propertyId}) https://www.wikidata.org/wiki/Property:{propertyId}
{WikidataItem.getPrefixes()}
""" + sparql
        title = f"non tabular entries for {self.item.qlabel}/{propertyLabel}:{freqDesc}"
        name = f"NonTabular {self.item.qlabel}/{propertyLabel}:{freqDesc}"
        query = Query(query=sparql, name=name, title=title)
        return query

    def noneTabular(self, wdProperty: "WikidataProperty"):
        '''
        get the none tabular result for the given Wikidata property

        Args:
            wdProperty(WikidataProperty): the Wikidata property

        Returns:
            list: the list of dicts resulting from the frequency query
        '''
        query = self.noneTabularQuery(wdProperty)
        if self.debug:
            logging.info(query.query)
        qlod = self.sparql.queryAsListOfDicts(query.query)
        return qlod

    def addStatsColWithPercent(self, m: dict, col: str, value: Union[int, float],
                               total: Union[int, float]):
        '''
        add a statistics Column together with its percentage column

        Args:
            m(dict): the statistics row to add the columns to
            col(str): name of the column
            value: value
            total: total value - if None or not positive the percentage is set to None
        '''
        m[col] = value
        if total is not None and total > 0:
            # percentage rounded to one decimal place
            m[f"{col}%"] = float(f"{value/total*100:.1f}")
        else:
            m[f"{col}%"] = None

    def genWdPropertyStatistic(self, wdProperty: "WikidataProperty", itemCount: int,
                               withQuery=True) -> dict:
        '''
        generate a property Statistics Row for the given wikidata Property

        Args:
            wdProperty(WikidataProperty): the property to get the statistics for
            itemCount(int): the total number of items to check
            withQuery(bool): if true include the sparql query

        Returns:
            dict: a statistics row
        '''
        ntlod = self.noneTabular(wdProperty)
        statsRow = {"property": wdProperty.plabel}
        total = 0
        nttotal = 0
        maxCount = 0
        for record in ntlod:
            f = int(record["frequency"])
            count = int(record["count"])
            #statsRow[f"f{count}"]=f
            if count > 1:
                # more than one value per item: not tabular
                nttotal += f
            else:
                statsRow["1"] = f
            if count > maxCount:
                maxCount = count
            total += f
        statsRow["maxf"] = maxCount
        if withQuery:
            statsRow["queryf"] = self.noneTabularQuery(wdProperty).query
            statsRow["queryex"] = self.noneTabularQuery(
                wdProperty, asFrequency=False).query
        self.addStatsColWithPercent(statsRow, "total", total, itemCount)
        self.addStatsColWithPercent(statsRow, "non tabular", nttotal, total)
        return statsRow

    def genPropertyStatistics(self):
        '''
        generate the property Statistics

        Returns:
            generator: a generator of statistic dict rows
        '''
        itemCount, _itemCountQuery = self.count()
        for wdProperty in self.properties.values():
            statsRow = self.genWdPropertyStatistic(wdProperty, itemCount)
            yield statsRow

    def getPropertyStatistics(self):
        '''
        get the property Statistics

        Returns:
            list: a list of statistic dict rows starting with a sum row
        '''
        itemCount, _itemCountQuery = self.count()
        lod = [{"property": "∑", "total": itemCount, "total%": 100.0}]
        for wdProperty in self.properties.values():
            statsRow = self.genWdPropertyStatistic(wdProperty, itemCount)
            lod.append(statsRow)
        return lod
class EntityManager(YamlAbleMixin, JsonAbleMixin):
    '''
    generic entity manager
    '''

    def __init__(self, name, entityName, entityPluralName, config=None, debug=False):
        '''
        Constructor

        Args:
            name(string): name of this eventManager
            entityName(string): entityType to be managed e.g. Country
            entityPluralName(string): plural of the the entityType e.g. Countries
            config(StorageConfig): the configuration to be used if None a default configuration will be used
            debug(boolean): override debug setting when default of config is used via config=None

        Raises:
            Exception: if the mode is SPARQL and the configuration has no endpoint
        '''
        self.name = name
        self.entityName = entityName
        self.entityPluralName = entityPluralName
        if config is None:
            config = StorageConfig.getDefault()
            if config.tableName is None:
                config.tableName = entityName
            if debug:
                config.debug = debug
        self.config = config
        cacheFile = self.getCacheFile(config=config, mode=config.mode)
        self.showProgress("Creating %smanager(%s) for %s using cache %s" %
                          (self.entityName, config.mode, self.name, cacheFile))
        if config.mode is StoreMode.DGRAPH:
            self.dgraph = Dgraph(debug=config.debug, host=config.host, profile=config.profile)
        elif config.mode is StoreMode.SPARQL:
            if config.endpoint is None:
                raise Exception("no endpoint set for mode sparql")
            self.endpoint = config.endpoint
            self.sparql = SPARQL(config.endpoint, debug=config.debug, profile=config.profile)
        elif config.mode is StoreMode.SQL:
            self.executeMany = False  # may be True when issues are fixed

    def storeMode(self):
        '''
        return my store mode
        '''
        return self.config.mode

    def showProgress(self, msg):
        '''
        display a progress message

        Args:
            msg(string): the message to display
        '''
        if self.config.withShowProgress:
            print(msg, flush=True)

    @staticmethod
    def getCachePath():
        '''
        get the path of the cache directory

        Returns:
            string: the "cache" directory next to the directory of this module
        '''
        path = os.path.dirname(__file__)
        cachedir = path + "/../cache"
        return cachedir

    def getCacheFile(self, config=None, mode=StoreMode.SQL):
        '''
        get the cache file for this event manager

        Args:
            config(StorageConfig): if None get the cache for my mode
            mode(StoreMode): the storeMode to use

        Returns:
            string: the configured cacheFile if there is one, else the
            mode specific path/description of the cache
        '''
        if config is not None and config.cacheFile is not None:
            return config.cacheFile
        # config=None is documented as valid - use my own configuration
        # for the endpoint/tableName lookups instead of failing on None
        effectiveConfig = self.config if config is None else config
        cachedir = EntityManager.getCachePath()
        # get the path to the file for my cached data
        if mode is StoreMode.JSON:
            cachepath = "%s/%s-%s.%s" % (cachedir, self.name, "events", 'json')
        elif mode is StoreMode.SPARQL:
            cachepath = "%s %s" % ('SPARQL', effectiveConfig.endpoint)
        elif mode is StoreMode.SQL:
            cachepath = "%s/%s.db" % (cachedir, effectiveConfig.tableName)
        else:
            cachepath = "undefined cachepath for %s" % (mode)
        return cachepath

    def getSQLDB(self, cacheFile):
        '''
        get the SQL database for the given cacheFile

        Args:
            cacheFile(string): the file to get the SQL db from

        Returns:
            SQLDB: the database which is also kept as self.sqldb
        '''
        config = self.config
        sqldb = self.sqldb = SQLDB(cacheFile, debug=config.debug, errorDebug=config.errorDebug)
        return sqldb

    def isCached(self):
        '''
        check whether there is a file containing cached data for me

        Returns:
            bool: True if cached data is available - for SPARQL and SQL
            more than 100 records are needed

        Raises:
            Exception: if my store mode is not supported
        '''
        result = False
        config = self.config
        mode = self.config.mode
        if mode is StoreMode.JSON:
            result = os.path.isfile(
                self.getCacheFile(config=self.config, mode=StoreMode.JSON))
        elif mode is StoreMode.SPARQL:
            # @FIXME - make abstract
            query = config.prefix + """
SELECT ?source (COUNT(?source) AS ?sourcecount)
WHERE {
   ?event cr:Event_source ?source.
}
GROUP by ?source
"""
            sourceCountList = self.sparql.queryAsListOfDicts(query)
            for sourceCount in sourceCountList:
                source = sourceCount['source']
                recordCount = sourceCount['sourcecount']
                if source == self.name and recordCount > 100:
                    result = True
        elif mode is StoreMode.SQL:
            cacheFile = self.getCacheFile(config=self.config, mode=StoreMode.SQL)
            if os.path.isfile(cacheFile):
                sqlQuery = "SELECT COUNT(*) AS count FROM %s" % config.tableName
                try:
                    sqlDB = self.getSQLDB(cacheFile)
                    countResult = sqlDB.query(sqlQuery)
                    count = countResult[0]['count']
                    result = count > 100
                except Exception:
                    # deliberately best effort: a failing count query means "not cached"
                    # e.g. sqlite3.OperationalError: no such table: Event_crossref
                    pass
        else:
            # self.mode does not exist - the mode is part of my configuration
            raise Exception("unsupported mode %s" % mode)
        return result

    def fromCache(self):
        '''
        get my entries from the cache

        Returns:
            the list of Dicts and as a side effect setting self.cacheFile
        '''
        if not self.isCached():
            listOfDicts = self.getListOfDicts()
            self.cacheFile = self.store(listOfDicts)
        else:
            # fromStore also sets self.cacheFile
            listOfDicts = self.fromStore()
        return listOfDicts

    def fromStore(self, cacheFile=None):
        '''
        restore me from the store

        Args:
            cacheFile(String): the cacheFile to use if None use the preconfigured Cachefile

        Returns:
            list: list of dicts or JSON entitymanager (in JSON mode the
            result of JsonAbleMixin.readJson is returned as is)

        Raises:
            Exception: if my store mode is not supported
        '''
        startTime = time.time()
        if cacheFile is None:
            cacheFile = self.getCacheFile(config=self.config, mode=self.config.mode)
        self.cacheFile = cacheFile
        self.showProgress("reading %s for %s from cache %s" %
                          (self.entityPluralName, self.name, cacheFile))
        mode = self.config.mode
        if mode is StoreMode.JSON:
            # return directly - there is no list of dicts to report on
            # in this mode
            return JsonAbleMixin.readJson(cacheFile)
        elif mode is StoreMode.SPARQL:
            # @FIXME make abstract
            eventQuery = """
PREFIX cr: <http://cr.bitplan.com/>
SELECT ?eventId ?acronym ?series ?title ?year ?country ?city ?startDate ?endDate ?url ?source WHERE {
   OPTIONAL { ?event cr:Event_eventId ?eventId. }
   OPTIONAL { ?event cr:Event_acronym ?acronym. }
   OPTIONAL { ?event cr:Event_series ?series. }
   OPTIONAL { ?event cr:Event_title ?title. }
   OPTIONAL { ?event cr:Event_year ?year. }
   OPTIONAL { ?event cr:Event_country ?country. }
   OPTIONAL { ?event cr:Event_city ?city. }
   OPTIONAL { ?event cr:Event_startDate ?startDate. }
   OPTIONAL { ?event cr:Event_endDate ?endDate. }
   OPTIONAL { ?event cr:Event_url ?url. }
   ?event cr:Event_source ?source FILTER(?source='%s').
}
""" % self.name
            listOfDicts = self.sparql.queryAsListOfDicts(eventQuery)
        elif mode is StoreMode.SQL:
            sqlQuery = "SELECT * FROM %s" % self.config.tableName
            sqlDB = self.getSQLDB(cacheFile)
            listOfDicts = sqlDB.query(sqlQuery)
            sqlDB.close()
        else:
            raise Exception("unsupported store mode %s" % mode)
        self.showProgress("read %d %s from %s in %5.1f s" %
                          (len(listOfDicts), self.entityPluralName, self.name,
                           time.time() - startTime))
        return listOfDicts

    def store(self, listOfDicts, limit=10000000, batchSize=250, cacheFile=None,
              sampleRecordCount=1):
        '''
        store my entities

        Args:
            listOfDicts(list): the list of dicts to store
            limit(int): maximumn number of records to store
            batchSize(int): size of batch for storing
            cacheFile(string): the name of the storage e.g path to JSON or sqlite3 file
            sampleRecordCount(int): the number of records to analyze for type information

        Returns:
            string: the cacheFile used (None if none was given or needed)

        Raises:
            Exception: if my store mode is not supported
        '''
        config = self.config
        mode = config.mode
        if mode is StoreMode.JSON:
            if cacheFile is None:
                cacheFile = self.getCacheFile(config=self.config, mode=StoreMode.JSON)
            # writeJson serializes me, not listOfDicts
            # NOTE(review): self.events is not set in this class - presumably by subclasses
            self.showProgress("storing %d events for %s to cache %s" %
                              (len(self.events), self.name, cacheFile))
            self.writeJson(cacheFile)
        elif mode is StoreMode.DGRAPH:
            startTime = time.time()
            # report what is actually stored: the given list of dicts
            self.showProgress("storing %d %s for %s to %s" %
                              (len(listOfDicts), self.entityPluralName, self.name, mode))
            self.dgraph.addData(listOfDicts, limit=limit, batchSize=batchSize)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        elif mode is StoreMode.SPARQL:
            startTime = time.time()
            # @ FIXME make abstract
            self.showProgress("storing %d events for %s to %s" %
                              (len(listOfDicts), self.name, mode))
            entityType = "cr:Event"
            prefixes = "PREFIX cr: <http://cr.bitplan.com/>"
            primaryKey = "eventId"
            self.sparql.insertListOfDicts(listOfDicts, entityType, primaryKey, prefixes,
                                          limit=limit, batchSize=batchSize)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        elif mode is StoreMode.SQL:
            startTime = time.time()
            if cacheFile is None:
                cacheFile = self.getCacheFile(config=self.config, mode=self.config.mode)
            sqldb = self.getSQLDB(cacheFile)
            self.showProgress("storing %d %s for %s to %s:%s" %
                              (len(listOfDicts), self.entityPluralName, self.name,
                               config.mode, cacheFile))
            entityInfo = sqldb.createTable(listOfDicts, config.tableName, "eventId",
                                           withDrop=True,
                                           sampleRecordCount=sampleRecordCount)
            self.sqldb.store(listOfDicts, entityInfo, executeMany=self.executeMany)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        else:
            raise Exception("unsupported store mode %s" % mode)
        return cacheFile
def testQueryDocumentation(self):
    '''
    test QueryDocumentation: run a set of example queries and
    document the results in several table formats
    '''
    show = self.debug
    #show=True
    wikidataUrl = "https://query.wikidata.org/sparql"
    nickNames = {
        "endpoint": wikidataUrl,
        "prefixes": [],
        "lang": "sparql",
        "name": "Nicknames",
        "description": "https://stackoverflow.com/questions/70206791/sparql-i-have-individual-with-multiple-values-for-single-object-property-how",
        "title": "Nick names of US Presidents",
        "query": """SELECT ?item ?itemLabel (GROUP_CONCAT(DISTINCT ?nickName; SEPARATOR=",") as ?nickNames)
WHERE
{
  # president
  ?item wdt:P39 wd:Q11696.
  ?item wdt:P1449 ?nickName
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} GROUP BY ?item ?itemLabel"""
    }
    cas15 = {
        "endpoint": wikidataUrl,
        "prefixes": [
            "http://www.wikidata.org/entity/",
            "http://commons.wikimedia.org/wiki/Special:FilePath/"
        ],
        "lang": "sparql",
        "name": "CAS15",
        "title": "15 Random substances with CAS number",
        "description": "Wikidata SPARQL query showing the 15 random chemical substances with their CAS Number",
        "query": """# List of 15 random chemical components with CAS-Number, formula and structure
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
# WF 2021-08-23
SELECT ?substance ?substanceLabel ?formula ?structure ?CAS
WHERE {
  ?substance wdt:P31 wd:Q11173.
  ?substance wdt:P231 ?CAS.
  ?substance wdt:P274 ?formula.
  ?substance wdt:P117 ?structure.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 15
"""
    }
    cityTop10 = {
        "endpoint": wikidataUrl,
        "prefixes": ["http://www.wikidata.org/entity/"],
        "lang": "sparql",
        "name": "CityTop10",
        "title": "Ten largest cities of the world",
        "description": "Wikidata SPARQL query showing the 10 most populated cities of the world using the million city class Q1637706 for selection",
        "query": """# Ten Largest cities of the world
# WF 2021-08-23
# see also http://wiki.bitplan.com/index.php/PyLoDStorage#Examples
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
SELECT DISTINCT ?city ?cityLabel ?population ?country ?countryLabel
WHERE {
  VALUES ?cityClass { wd:Q1637706}.
  ?city wdt:P31 ?cityClass .
  ?city wdt:P1082 ?population .
  ?city wdt:P17 ?country .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
ORDER BY DESC(?population)
LIMIT 10"""
    }
    osmPlaceTypes = {
        "endpoint": "https://sophox.org/sparql",
        "lang": "sparql",
        "prefixes": [],
        "query": """# count osm place type instances
# WF 2021-08-23
# see also http://wiki.bitplan.com/index.php/PyLoDStorage#Examples
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
SELECT (count(?instance) as ?count) ?placeType ?placeTypeLabel
WHERE {
  VALUES ?placeType {
    "city"
    "town"
    "village"
  }
  ?instance osmt:place ?placeType
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?placeType ?placeTypeLabel
ORDER BY ?count""",
        "name": "OSM place types",
        "title": "count OpenStreetMap place type instances",
        "description": """This SPARQL query
determines the number of instances available in the OpenStreetMap for the placeTypes city,town and village
"""
    }
    # switch for dumping the yaml markup of each query while developing
    showYaml = False
    tableFormats = ["mediawiki", "github", "latex"]
    for spec in [nickNames, cas15, cityTop10, osmPlaceTypes]:
        # the endpoint is not a Query attribute - take it out
        # before using the rest of the entries as keyword arguments
        endpointUrl = spec.pop("endpoint")
        endpoint = SPARQL(endpointUrl)
        query = Query(**spec)
        if showYaml:
            print(query.asYaml())
        try:
            qlod = endpoint.queryAsListOfDicts(query.query)
            for tablefmt in tableFormats:
                doc = query.documentQueryResult(qlod, tablefmt=tablefmt, floatfmt=".0f")
                docstr = doc.asText()
                if show:
                    print(docstr)
        except Exception as ex:
            # public endpoints may be unavailable - report instead of failing
            print(f"{query.title} at {endpointUrl} failed: {ex}")
def getDBPedia(self, mode='query', debug=False):
    '''
    create a SPARQL access object for the public DBpedia endpoint

    Args:
        mode(string): query or update
        debug(boolean): True if debug information should be output

    Returns:
        SPARQL: the endpoint wrapper for http://dbpedia.org/sparql
    '''
    dbpediaUrl = "http://dbpedia.org/sparql"
    return SPARQL(dbpediaUrl, mode=mode, debug=debug)
def fromWikiData(self, endpoint):
    '''
    query the city list from WikiData and normalize the records

    Args:
        endpoint(string): the url of the endpoint to be used

    Returns:
        list: the list of city dicts - it is also kept as self.cityList as a side effect
    '''
    # fields handed to setNone for every city record
    noneFields = ['coord', 'date', 'cityPopulation', 'countryPopulation', 'country',
                  'countryLabel', 'countryIsoCode', 'countryGDP_perCapita', 'region',
                  'regionLabel', 'regionIsoCode', 'ratio']
    sparqlEndpoint = SPARQL(endpoint)
    cityQuery = """# get a list of cities
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
# get human settlements
SELECT DISTINCT ?city ?cityLabel (max(?cityPop) as ?cityPopulation) (min (?coord) as ?cityCoord) ?region ?regionLabel ?regionIsoCode ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGdpPerCapita WHERE {
  # if you uncomment this line this query might run for some 3 hours on a local wikidata copy using Apache Jena
  # run for Vienna, Illinois, Vienna Austria, Paris Texas and Paris France as example only
  # VALUES ?city { wd:Q577544 wd:Q1741 wd:Q830149 wd:Q90}.
  # run for Andorra Q228
  # VALUES ?country {wd:Q228}.
  # instance of human settlement https://www.wikidata.org/wiki/Q486972
  ?city wdt:P31/wdt:P279* wd:Q486972 .
  # label of the City
  ?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
  # country this city belongs to
  ?city wdt:P17 ?country .
  # label for the country
  ?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
  # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
  ?country wdt:P297 ?countryIsoCode.
  # population of country
  ?country wdt:P1082 ?countryPopulation.
  OPTIONAL {
     ?country wdt:P2132 ?countryGdpPerCapita.
  }
  OPTIONAL {
     # located in administrative territory
     # https://www.wikidata.org/wiki/Property:P131
     ?city wdt:P131* ?region.
     # administrative unit of first order
     ?region wdt:P31/wdt:P279* wd:Q10864048.
     ?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
     # isocode state/province
     OPTIONAL { ?region wdt:P300 ?regionIsoCode. }
  }
  # population of city
  OPTIONAL { ?city wdt:P1082 ?cityPop.}
  # get the coordinates
  OPTIONAL { ?city wdt:P625 ?coord. }
}
GROUP BY ?city ?cityLabel ?cityCoord ?region ?regionLabel ?regionIsoCode ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGdpPerCapita
#ORDER BY ?cityLabel
"""
    rawResults = sparqlEndpoint.query(cityQuery)
    self.cityList = sparqlEndpoint.asListOfDicts(rawResults)
    for cityRecord in self.cityList:
        # rename the query variables to the record field names
        cityRecord['wikidataurl'] = cityRecord.pop('city')
        cityRecord['name'] = cityRecord.pop('cityLabel')
        super().setNone(cityRecord, noneFields)
    return self.cityList