def test_query_with_authentication(self):
    """
    Check that a protected endpoint rejects an anonymous query and
    answers the same query once credentials have been added.
    """
    endpointUrl = "http://localhost:5820/dblp/query"
    query = """SELECT * WHERE { ?proceeding dblp:publishedInSeriesVolume "2816" .}"""
    sparql = SPARQL(endpointUrl, method="POST")
    # without credentials the request has to be refused
    with self.assertRaises(SPARQLExceptions.Unauthorized):
        sparql.queryAsListOfDicts(queryString=query)
    # with credentials the very same query is expected to yield two records
    sparql.addAuthentication("admin", "admin")
    records = sparql.queryAsListOfDicts(query)
    self.assertEqual(2, len(records))
def getItemsByLabel(cls, sparql: SPARQL, itemLabel: str, lang: str = "en") -> list:
    '''
    get the Wikidata items that carry the given label

    Args:
        sparql(SPARQL): the SPARQL endpoint to use
        itemLabel(str): the label of the items
        lang(str): the language of the label

    Returns:
        a list of potential items sorted by their qnumber
    '''
    # the label ends up inside a double quoted SPARQL string literal, so
    # backslashes, double quotes and line breaks have to be escaped -
    # otherwise a label such as: say "hi" would produce a malformed query
    escapedLabel = (itemLabel.replace("\\", "\\\\")
                    .replace('"', '\\"')
                    .replace("\n", "\\n")
                    .replace("\r", "\\r"))
    valuesClause = f' "{escapedLabel}"@{lang}\n'
    query = f"""# get the items that have the given label in the given language
# e.g. we'll find human=Q5 as the oldest type for the label "human" first
# and then the newer ones such as "race in Warcraft"
{cls.getPrefixes(["rdfs","schema","xsd"])}
SELECT
  #?itemId
  ?item
  ?itemLabel
  ?itemDescription
WHERE {{
  VALUES ?itemLabel {{
    {valuesClause}
  }}
  #BIND (xsd:integer(SUBSTR(STR(?item),33)) AS ?itemId)
  ?item rdfs:label ?itemLabel.
  ?item schema:description ?itemDescription.
  FILTER(LANG(?itemDescription)="{lang}")
}}
#ORDER BY ?itemId"""
    qLod = sparql.queryAsListOfDicts(query)
    items = []
    for record in qLod:
        url = record["item"]
        # strip the entity url prefix to get the plain Q identifier
        qid = re.sub(r"http://www.wikidata.org/entity/(.*)", r"\1", url)
        item = WikidataItem(qid)
        item.url = url
        item.qlabel = record["itemLabel"]
        item.varname = Variable.validVarName(item.qlabel)
        item.description = record["itemDescription"]
        items.append(item)
    # ordering is done client side since the ORDER BY of the query is disabled
    sortedItems = sorted(items, key=lambda item: item.qnumber)
    return sortedItems
def testStackoverflow55961615Query(self):
    '''
    document the result of a Wikidata entity search query

    see
    https://stackoverflow.com/questions/55961615/how-to-integrate-wikidata-query-in-python
    https://stackoverflow.com/a/69771615/1497139
    '''
    endpoint = "https://query.wikidata.org/sparql"
    queryString = """SELECT ?s ?sLabel ?item ?itemLabel ?sourceCode ?webSite ?stackexchangeTag {
    SERVICE wikibase:mwapi {
        bd:serviceParam wikibase:api "EntitySearch".
        bd:serviceParam wikibase:endpoint "www.wikidata.org".
        bd:serviceParam mwapi:search "natural language processing".
        bd:serviceParam mwapi:language "en".
        ?item wikibase:apiOutputItem mwapi:item.
        ?num wikibase:apiOrdinal true.
    }
    ?s wdt:P279|wdt:P31 ?item .
    OPTIONAL {
      ?s wdt:P1324 ?sourceCode.
    }
    OPTIONAL {
      ?s wdt:P856 ?webSite.
    }
    OPTIONAL {
      ?s wdt:P1482 ?stackexchangeTag.
    }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
}
ORDER BY ?itemLabel ?sLabel"""
    qlod = None
    try:
        wd = SPARQL(endpoint)
        qlod = wd.queryAsListOfDicts(queryString, fixNone=True)
    except Exception as ex:
        # an unreachable endpoint only skips the test instead of failing it
        print(f"{endpoint} access failed with {ex}- could not run test")
    if qlod is None:
        return
    query = Query(name="EntitySearch", query=queryString, lang='sparql')
    showDoc = self.debug
    for tablefmt in ("github", "mediawiki", "latex"):
        qdoc = query.documentQueryResult(qlod, tablefmt=tablefmt)
        if showDoc:
            print(qdoc)
def testStats(self):
    '''
    print usage statistics (entity types and properties) of the
    configured endpoint as wiki markup - skipped if the endpoint
    is not available
    '''
    if not self.available():
        return
    entityQuery = Query(
        'entities and usage frequency', '''
# get histogramm data of entities by
# usage frequency
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT ?c (COUNT(?c) AS ?count)
WHERE {
  ?subject a ?c
}
GROUP BY ?c
HAVING (?count >100)
ORDER BY DESC(?count)
''')
    propertyQuery = Query(
        'relevance of fields', '''# get histogramm data of properties by
# usage frequency
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
HAVING (?propTotal >1000)
ORDER BY DESC(?propTotal)''')
    sparql = SPARQL(self.endpoint)
    for query in (entityQuery, propertyQuery):
        records = sparql.queryAsListOfDicts(query.query)
        # shorten the namespace url to keep the markup readable
        markup = query.asWikiMarkup(records).replace(
            "https://d-nb.info/standards/elementset/gnd", "gnd")
        print(f"=== {query.name} ===")
        print(markup)
def testStackoverflow71444069(self):
    '''
    convert the result of a Wikidata query to CSV

    https://stackoverflow.com/questions/71444069/create-csv-from-result-of-a-for-google-colab/71548650#71548650
    '''
    from lodstorage.sparql import SPARQL
    from lodstorage.csv import CSV
    sparqlQuery = """SELECT ?org ?orgLabel
WHERE
{
  ?org wdt:P31 wd:Q4830453. #instance of organizations
  ?org wdt:P17 wd:Q96. #Mexico country

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en"}
}"""
    wikidata = SPARQL("https://query.wikidata.org/sparql")
    csvText = CSV.toCSV(wikidata.queryAsListOfDicts(sparqlQuery))
    if self.debug:
        print(csvText)
def test_SPARQL(self):
    '''
    test SPARQL queries against the local conference endpoint
    '''
    # disable test for the time being
    return
    # everything below is intentionally unreachable while the test is disabled
    qm = QueryManager(lang='sparql', debug=False)
    self.assertEqual(4, len(qm.queriesByName))
    sparql = SPARQL("http://localhost:3030/cr")
    for name, query in qm.queriesByName.items():
        records = sparql.queryAsListOfDicts(query.query)
        markup = query.asWikiMarkup(records).replace(
            "http://cr.bitplan.com/",
            "https://cr.bitplan.com/index.php/Property:")
        print(f"== {name} ==")
        print("=== query ===")
        print(query.asWikiSourceMarkup())
        print("=== result ===")
        print(markup)
def testIssue20And76(self):
    '''
    check that with fixNone every result row contains every field -
    for both the POST and the GET method

    see https://github.com/WolfgangFahl/pyLoDStorage/issues/20
    add fixNone option to SPARQL results (same functionality as in SQL)

    https://github.com/WolfgangFahl/pyLoDStorage/issues/76
    SPARQL GET method support
    '''
    endpoint = "https://query.wikidata.org/sparql"
    # the query is the same for both methods
    queryString = """
# Conference Series wikidata query
# see https://confident.dbis.rwth-aachen.de/dblpconf/wikidata
# WF 2021-01-30
SELECT ?confSeries ?short_name ?official_website
WHERE
{
  # scientific conference series (Q47258130)
  ?confSeries wdt:P31 wd:Q47258130.
  OPTIONAL { ?confSeries wdt:P1813 ?short_name . }
  # official website (P856)
  OPTIONAL { ?confSeries wdt:P856 ?official_website }
}
LIMIT 200
"""
    for method in ("POST", "GET"):
        wd = SPARQL(endpoint, method=method)
        lod = wd.queryAsListOfDicts(queryString, fixNone=True)
        fields = LOD.getFields(lod)
        if self.debug:
            print(fields)
        # the OPTIONAL fields must show up in every row
        for row in lod:
            for field in fields:
                self.assertTrue(field in row)
def testSparqlQueries(self):
    '''
    run the selected predefined SPARQL queries and document their
    results in all supported table formats
    '''
    # set show to True to always print the documentation
    show = self.debug
    selectedNames = ["US President Nicknames"]
    tableFormats = ("mediawiki", "github", "latex")
    qm = QueryManager(lang='sparql', debug=False)
    for name, query in qm.queriesByName.items():
        if name not in selectedNames:
            continue
        if show:
            print(f"{name}:{query}")
        endpoint = SPARQL(query.endpoint)
        try:
            qlod = endpoint.queryAsListOfDicts(query.query)
            for tablefmt in tableFormats:
                doc = query.documentQueryResult(qlod, tablefmt=tablefmt,
                                                floatfmt=".0f")
                docstr = doc.asText()
                if show:
                    print(docstr)
        except Exception as ex:
            # endpoint problems are reported but do not fail the test
            print(f"{query.title} at {query.endpoint} failed: {ex}")
class EntityManager(YamlAbleMixin, JsonAbleMixin):
    '''
    generic entity manager

    keeps a list of entity records in one of the stores selected by
    the StoreMode of its configuration (JSON, DGRAPH, SPARQL or SQL)
    '''

    def __init__(self, name, entityName, entityPluralName, config=None, debug=False):
        '''
        Constructor

        Args:
            name(string): name of this eventManager
            entityName(string): entityType to be managed e.g. Country
            entityPluralName(string): plural of the the entityType e.g. Countries
            config(StorageConfig): the configuration to be used if None a default configuration will be used
            debug(boolean): override debug setting when default of config is used via config=None

        Raises:
            Exception: if the mode is SPARQL and the configuration has no endpoint
        '''
        self.name = name
        self.entityName = entityName
        self.entityPluralName = entityPluralName
        if config is None:
            config = StorageConfig.getDefault()
            # NOTE(review): the table name and debug defaults are applied to the
            # default configuration only (as the debug parameter is documented)
            # - confirm that a caller supplied config is meant to stay untouched
            if config.tableName is None:
                config.tableName = entityName
            if debug:
                config.debug = debug
        self.config = config
        cacheFile = self.getCacheFile(config=config, mode=config.mode)
        self.showProgress("Creating %smanager(%s) for %s using cache %s" %
                          (self.entityName, config.mode, self.name, cacheFile))
        if config.mode is StoreMode.DGRAPH:
            self.dgraph = Dgraph(debug=config.debug, host=config.host,
                                 profile=config.profile)
        elif config.mode is StoreMode.SPARQL:
            if config.endpoint is None:
                raise Exception("no endpoint set for mode sparql")
            self.endpoint = config.endpoint
            self.sparql = SPARQL(config.endpoint, debug=config.debug,
                                 profile=config.profile)
        elif config.mode is StoreMode.SQL:
            self.executeMany = False  # may be True when issues are fixed

    def storeMode(self):
        '''
        return my store mode
        '''
        return self.config.mode

    def showProgress(self, msg):
        '''
        display a progress message

        Args:
            msg(string): the message to display
        '''
        if self.config.withShowProgress:
            print(msg, flush=True)

    @staticmethod
    def getCachePath():
        '''
        get the path of the cache directory

        Returns:
            string: the "cache" directory next to the parent of this module's directory
        '''
        path = os.path.dirname(__file__)
        cachedir = path + "/../cache"
        return cachedir

    def getCacheFile(self, config=None, mode=StoreMode.SQL):
        '''
        get the cache file for this event manager

        Args:
            config(StorageConfig): if None get the cache for my mode
            mode(StoreMode): the storeMode to use

        Returns:
            string: the explicitly configured cacheFile if there is one, otherwise
            a path (JSON, SQL) or a descriptive text (SPARQL, other modes)
        '''
        cachedir = EntityManager.getCachePath()
        if config is not None and config.cacheFile is not None:
            return config.cacheFile
        if config is None:
            # the SPARQL and SQL branches need an endpoint/table name -
            # fall back to my own configuration instead of dereferencing None
            config = self.config
        # get the path to the file for my cached data
        if mode is StoreMode.JSON:
            cachepath = "%s/%s-%s.%s" % (cachedir, self.name, "events", 'json')
        elif mode is StoreMode.SPARQL:
            cachepath = "%s %s" % ('SPARQL', config.endpoint)
        elif mode is StoreMode.SQL:
            cachepath = "%s/%s.db" % (cachedir, config.tableName)
        else:
            cachepath = "undefined cachepath for %s" % (mode)
        return cachepath

    def getSQLDB(self, cacheFile):
        '''
        get the SQL database for the given cacheFile

        the database is also remembered as self.sqldb

        Args:
            cacheFile(string): the file to get the SQL db from

        Returns:
            SQLDB: the database for the cacheFile
        '''
        config = self.config
        sqldb = self.sqldb = SQLDB(cacheFile, debug=config.debug,
                                   errorDebug=config.errorDebug)
        return sqldb

    def isCached(self):
        '''
        check whether there is a file containing cached data for me

        Returns:
            boolean: True if cached data is available - for SPARQL and SQL
            more than 100 records are required to count as cached

        Raises:
            Exception: if my store mode is not supported
        '''
        result = False
        config = self.config
        mode = self.config.mode
        if mode is StoreMode.JSON:
            result = os.path.isfile(
                self.getCacheFile(config=self.config, mode=StoreMode.JSON))
        elif mode is StoreMode.SPARQL:
            # @FIXME - make abstract
            query = config.prefix + """
SELECT ?source (COUNT(?source) AS ?sourcecount)
WHERE {
   ?event cr:Event_source ?source.
}
GROUP by ?source
"""
            sourceCountList = self.sparql.queryAsListOfDicts(query)
            for sourceCount in sourceCountList:
                source = sourceCount['source']
                recordCount = sourceCount['sourcecount']
                if source == self.name and recordCount > 100:
                    result = True
        elif mode is StoreMode.SQL:
            cacheFile = self.getCacheFile(config=self.config,
                                          mode=StoreMode.SQL)
            if os.path.isfile(cacheFile):
                sqlQuery = "SELECT COUNT(*) AS count FROM %s" % config.tableName
                try:
                    sqlDB = self.getSQLDB(cacheFile)
                    countResult = sqlDB.query(sqlQuery)
                    count = countResult[0]['count']
                    result = count > 100
                except Exception:
                    # deliberately best effort: a failing count query means
                    # "not cached" e.g.
                    # sqlite3.OperationalError: no such table: Event_crossref
                    pass
        else:
            # self.mode does not exist - the mode is kept in my configuration
            raise Exception("unsupported mode %s" % mode)
        return result

    def fromCache(self):
        '''
        get my entries from the cache

        if there is no cached data yet the records are fetched with
        getListOfDicts and stored

        Returns:
            the list of Dicts and as a side effect setting self.cacheFile
        '''
        if not self.isCached():
            listOfDicts = self.getListOfDicts()
            self.cacheFile = self.store(listOfDicts)
        else:
            # fromStore also sets self.cacheFile
            listOfDicts = self.fromStore()
        return listOfDicts

    def fromStore(self, cacheFile=None):
        '''
        restore me from the store

        Args:
            cacheFile(String): the cacheFile to use if None use the preconfigured Cachefile

        Returns:
            list: list of dicts or JSON entitymanager

        Raises:
            Exception: if my store mode is not supported
        '''
        startTime = time.time()
        if cacheFile is None:
            cacheFile = self.getCacheFile(config=self.config,
                                          mode=self.config.mode)
        self.cacheFile = cacheFile
        self.showProgress("reading %s for %s from cache %s" %
                          (self.entityPluralName, self.name, cacheFile))
        JSONem = None
        mode = self.config.mode
        if mode is StoreMode.JSON:
            JSONem = JsonAbleMixin.readJson(cacheFile)
        elif mode is StoreMode.SPARQL:
            # @FIXME make abstract
            eventQuery = """
PREFIX cr: <http://cr.bitplan.com/>
SELECT ?eventId ?acronym ?series ?title ?year ?country ?city ?startDate ?endDate ?url ?source WHERE {
   OPTIONAL { ?event cr:Event_eventId ?eventId. }
   OPTIONAL { ?event cr:Event_acronym ?acronym. }
   OPTIONAL { ?event cr:Event_series ?series. }
   OPTIONAL { ?event cr:Event_title ?title. }
   OPTIONAL { ?event cr:Event_year ?year. }
   OPTIONAL { ?event cr:Event_country ?country. }
   OPTIONAL { ?event cr:Event_city ?city. }
   OPTIONAL { ?event cr:Event_startDate ?startDate. }
   OPTIONAL { ?event cr:Event_endDate ?endDate. }
   OPTIONAL { ?event cr:Event_url ?url. }
   ?event cr:Event_source ?source FILTER(?source='%s').
}
""" % self.name
            listOfDicts = self.sparql.queryAsListOfDicts(eventQuery)
        elif mode is StoreMode.SQL:
            sqlQuery = "SELECT * FROM %s" % self.config.tableName
            sqlDB = self.getSQLDB(cacheFile)
            listOfDicts = sqlDB.query(sqlQuery)
            sqlDB.close()
        else:
            # self.mode does not exist - the mode is kept in my configuration
            raise Exception("unsupported store mode %s" % mode)
        if JSONem is not None:
            return JSONem
        else:
            self.showProgress("read %d %s from %s in %5.1f s" %
                              (len(listOfDicts), self.entityPluralName,
                               self.name, time.time() - startTime))
            return listOfDicts

    def store(self, listOfDicts, limit=10000000, batchSize=250, cacheFile=None, sampleRecordCount=1):
        '''
        store my entities

        Args:
            listOfDicts(list): the list of dicts to store
            limit(int): maximumn number of records to store
            batchSize(int): size of batch for storing
            cacheFile(string): the name of the storage e.g path to JSON or sqlite3 file
            sampleRecordCount(int): the number of records to analyze for type information

        Returns:
            string: the cacheFile used - for the DGRAPH and SPARQL mode this is
            the cacheFile argument as given

        Raises:
            Exception: if my store mode is not supported
        '''
        config = self.config
        mode = config.mode
        if mode is StoreMode.JSON:
            if cacheFile is None:
                cacheFile = self.getCacheFile(config=self.config,
                                              mode=StoreMode.JSON)
            # the JSON mode serializes this manager itself (writeJson), so the
            # count is taken from self.events
            # NOTE(review): events is not set by this class - presumably
            # provided by subclasses, confirm
            self.showProgress("storing %d events for %s to cache %s" %
                              (len(self.events), self.name, cacheFile))
            self.writeJson(cacheFile)
        elif mode is StoreMode.DGRAPH:
            startTime = time.time()
            # report the records actually handed to the store and the mode of
            # my configuration (there is no self.mode attribute)
            self.showProgress("storing %d %s for %s to %s" %
                              (len(listOfDicts), self.entityPluralName,
                               self.name, mode))
            self.dgraph.addData(listOfDicts, limit=limit, batchSize=batchSize)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        elif mode is StoreMode.SPARQL:
            startTime = time.time()
            # @ FIXME make abstract
            self.showProgress("storing %d events for %s to %s" %
                              (len(listOfDicts), self.name, mode))
            entityType = "cr:Event"
            prefixes = "PREFIX cr: <http://cr.bitplan.com/>"
            primaryKey = "eventId"
            self.sparql.insertListOfDicts(listOfDicts, entityType, primaryKey,
                                          prefixes, limit=limit,
                                          batchSize=batchSize)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        elif mode is StoreMode.SQL:
            startTime = time.time()
            if cacheFile is None:
                cacheFile = self.getCacheFile(config=self.config,
                                              mode=self.config.mode)
            sqldb = self.getSQLDB(cacheFile)
            self.showProgress("storing %d %s for %s to %s:%s" %
                              (len(listOfDicts), self.entityPluralName,
                               self.name, config.mode, cacheFile))
            # the table is dropped and recreated from the sample records
            entityInfo = sqldb.createTable(listOfDicts, config.tableName,
                                           "eventId", withDrop=True,
                                           sampleRecordCount=sampleRecordCount)
            self.sqldb.store(listOfDicts, entityInfo,
                             executeMany=self.executeMany)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        else:
            raise Exception("unsupported store mode %s" % mode)
        return cacheFile
class TrulyTabular(object):
    '''
    truly tabular SPARQL/RDF analysis

    checks "how tabular" a query based on a list of properties of an itemclass is
    '''

    def __init__(self, itemQid, propertyLabels: list = None, propertyIds: list = None, subclassPredicate="wdt:P31", where: str = None, endpointConf=None, lang="en", debug=False):
        '''
        Constructor

        Args:
            itemQid(str): wikidata id of the type to analyze
            propertyLabels(list): a list of labels of properties to be considered - None is handled as an empty list
            propertyIds(list): a list of ids of properties to be considered - None is handled as an empty list
            subclassPredicate(str): the subclass Predicate to use
            where(str): extra where clause for instance selection (if any)
            endpointConf(Endpoint): the configuration of the SPARQL endpoint to be used - if None the default endpoint is used
            lang(str): the language to use for labels
            debug(bool): if True switch on debugging
        '''
        # None instead of [] as default avoids sharing a mutable default
        # list between calls
        if propertyLabels is None:
            propertyLabels = []
        if propertyIds is None:
            propertyIds = []
        self.itemQid = itemQid
        self.debug = debug
        if endpointConf is None:
            endpointConf = Endpoint.getDefault()
        self.endpointConf = endpointConf
        self.sparql = SPARQL(endpointConf.endpoint,
                             method=self.endpointConf.method)
        self.sparql.debug = self.debug
        self.subclassPredicate = subclassPredicate
        # the extra clause is appended to the instance selection line
        self.where = f"\n {where}" if where is not None else ""
        self.lang = lang
        self.item = WikidataItem(itemQid, sparql=self.sparql, lang=lang)
        self.queryManager = TrulyTabular.getQueryManager(debug=self.debug)
        self.properties = WikidataProperty.getPropertiesByIds(
            self.sparql, propertyIds, lang)
        self.properties.update(
            WikidataProperty.getPropertiesByLabels(self.sparql,
                                                   propertyLabels, lang))
        self.isodate = datetime.datetime.now().isoformat()
        self.error = None

    def __str__(self):
        '''
        Returns:
            str: my text representation
        '''
        return self.asText(long=False)

    def count(self):
        '''
        get my count

        Returns:
            tuple: the number of items (None if the query failed - the
            exception is then kept in self.error) and the SPARQL query used
        '''
        itemText = self.getItemText()
        query = f"""# Count all items with the given type
# {itemText}
{WikidataItem.getPrefixes()}
SELECT (COUNT (DISTINCT ?item) AS ?count)
WHERE
{{
  # instance of {self.item.qlabel}
  ?item {self.subclassPredicate} wd:{self.item.qid}.{self.where}
}}"""
        try:
            count = self.sparql.getValue(query, "count")
            # workaround https://github.com/ad-freiburg/qlever/issues/717
            count = int(count)
        except Exception as ex:
            self.error = ex
            count = None
        return count, query

    def asText(self, long: bool = True):
        '''
        returns my content as a text representation

        Args:
            long(bool): True if a long format including url is wished

        Returns:
            str: a text representation of my content
        '''
        text = self.item.asText(long)
        return text

    def getItemText(self):
        '''
        get the short text for my item

        Returns:
            str: the Q identifier and the label of my item separated by a colon
        '''
        # leads to 405 Method not allowed in SPARQLWrapper under certain circumstances
        # itemText=self.asText(long=True)
        itemText = f"{self.itemQid}:{self.item.qlabel}"
        return itemText

    @classmethod
    def getQueryManager(cls, lang='sparql', name="trulytabular", debug=False):
        '''
        get the query manager for the given language and fileName

        Args:
            lang(str): the language of the queries to extract
            name(str): the name of the manager containing the query specifications
            debug(bool): if True set debugging on

        Returns:
            QueryManager: the manager for the first existing yaml file or None if there is no such file
        '''
        qYamlFileName = f"{name}.yaml"
        for qYamlFile in YamlPath.getPaths(qYamlFileName):
            if os.path.isfile(qYamlFile):
                qm = QueryManager(lang=lang, debug=debug,
                                  queriesPath=qYamlFile)
                return qm
        return None

    def generateSparqlQuery(self, genMap: dict, listSeparator: str = "⇹", naive: bool = True, lang: str = 'en') -> str:
        '''
        generate a SPARQL Query

        Args:
            genMap(dict): a dictionary of generation items aggregates/ignores/labels
            listSeparator(str): the symbole to use as a list separator for GROUP_CONCAT
            naive(bool): if True - generate a naive straight forward SPARQL query
                if False generate a proper truly tabular aggregate query
            lang(str): the language to generate for

        Returns:
            str: the generated SPARQL Query
        '''
        # The Wikidata item to generate the query for
        item = self.item
        # the name of this script
        script = Path(__file__).name
        # the mode of generation
        naiveText = "naive" if naive else "aggregate"
        # start with th preamble and PREFIX section
        # select the item and it's label
        sparqlQuery = f"""# truly tabular {naiveText} query for
# {item.qid}:{item.qlabel}
# generated by {script} version {Version.version} on {self.isodate}
{WikidataItem.getPrefixes()}
SELECT ?{item.itemVarname} ?{item.labelVarname}"""
        # loop over all properties
        for wdProp in self.properties.values():
            if naive:
                sparqlQuery += f"\n ?{wdProp.valueVarname}"
            else:
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    for aggregate in genList:
                        if aggregate not in ["ignore", "label"]:
                            # a real aggregate: list is mapped to GROUP_CONCAT,
                            # anything else is used as the function name
                            distinct = ""
                            if aggregate == "list":
                                aggregateFunc = "GROUP_CONCAT"
                                aggregateParam = f';SEPARATOR="{listSeparator}"'
                                distinct = "DISTINCT "
                            else:
                                if aggregate == "count":
                                    distinct = "DISTINCT "
                                aggregateFunc = aggregate.upper()
                                aggregateParam = ""
                            sparqlQuery += f"\n ({aggregateFunc} ({distinct}?{wdProp.valueVarname}{aggregateParam}) AS ?{wdProp.valueVarname}_{aggregate})"
                        elif aggregate == "label":
                            sparqlQuery += f"\n ?{wdProp.labelVarname}"
                        elif aggregate == "ignore" and "label" not in genList:
                            sparqlQuery += f"\n ?{wdProp.valueVarname}"
        sparqlQuery += f"""
WHERE {{
  # instanceof {item.qid}:{item.qlabel}
  ?{item.itemVarname} {self.subclassPredicate} wd:{item.qid}.
  # label
  ?{item.itemVarname} rdfs:label ?{item.labelVarname}.
  FILTER (LANG(?{item.labelVarname}) = "{lang}").
"""
        for wdProp in self.properties.values():
            sparqlQuery += f"""  # {wdProp}
  OPTIONAL {{
    ?{item.itemVarname} wdt:{wdProp.pid} ?{wdProp.valueVarname}.
"""
            if wdProp.pid in genMap:
                genList = genMap[wdProp.pid]
                if "label" in genList:
                    sparqlQuery += f"""\n    ?{wdProp.valueVarname} rdfs:label ?{wdProp.labelVarname}."""
                    sparqlQuery += f"""\n    FILTER (LANG(?{wdProp.labelVarname}) = "{lang}")."""
            sparqlQuery += "\n  }\n"
        # close where Clause
        sparqlQuery += """}\n"""
        # optionally add Aggregate
        if not naive:
            sparqlQuery += f"""GROUP BY
  ?{item.itemVarname}
  ?{item.labelVarname}
"""
            # every plain (non aggregated) column has to be grouped
            for wdProp in self.properties.values():
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    if "label" in genList:
                        sparqlQuery += f"\n ?{wdProp.labelVarname}"
                    if "ignore" in genList and "label" not in genList:
                        sparqlQuery += f"\n ?{wdProp.valueVarname}"
            # ignored properties are restricted to at most one value per item
            havingCount = 0
            havingDelim = " "
            for wdProp in self.properties.values():
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    if "ignore" in genList:
                        havingCount += 1
                        if havingCount == 1:
                            sparqlQuery += "\nHAVING ("
                        sparqlQuery += f"\n {havingDelim}COUNT(?{wdProp.valueVarname})<=1"
                        havingDelim = "&& "
            if havingCount > 0:
                sparqlQuery += "\n)"
        return sparqlQuery

    def mostFrequentPropertiesQuery(self, whereClause: str = None, minCount: int = 0):
        '''
        get the most frequently used properties

        Args:
            whereClause(str): an extra WhereClause to use
            minCount(int): if greater than zero only properties with a higher count are selected

        Returns:
            Query: the query for the most frequently used properties
        '''
        if whereClause is None:
            whereClause = f"?item {self.subclassPredicate} wd:{self.itemQid}"
            if self.endpointConf.database != "qlever":
                whereClause += ";?p ?id"
        # NOTE(review): the terminating dot is appended to a caller supplied
        # clause as well (reconstructed nesting) - confirm
        whereClause += "."
        minCountFilter = ""
        if minCount > 0:
            minCountFilter = f"\n FILTER(?count >{minCount})."
        itemText = self.getItemText()
        sparqlQuery = f"""# get the most frequently used properties for
# {itemText}
{WikidataItem.getPrefixes()}
SELECT ?prop ?propLabel ?wbType ?count WHERE {{
  {{"""
        # qlever counts by predicate and resolves the property afterwards
        if self.endpointConf.database == "qlever":
            sparqlQuery += f"""
    SELECT ?p (COUNT(DISTINCT ?item) AS ?count) WHERE {{"""
        else:
            sparqlQuery += f"""
    SELECT ?prop (COUNT(DISTINCT ?item) AS ?count) WHERE {{"""
        if self.endpointConf.database == "blazegraph":
            sparqlQuery += f"""
      hint:Query hint:optimizer "None"."""
        sparqlQuery += f"""
      {whereClause}"""
        if self.endpointConf.database == "qlever":
            sparqlQuery += f"""
      ?item ql:has-predicate ?p
    }} GROUP BY ?p
  }}
  ?prop wikibase:directClaim ?p."""
        else:
            sparqlQuery += f"""
      ?prop wikibase:directClaim ?p.
    }} GROUP BY ?prop ?propLabel
  }}"""
        sparqlQuery += f"""
  ?prop rdfs:label ?propLabel.
  ?prop wikibase:propertyType ?wbType.
  FILTER(LANG(?propLabel) = "{self.lang}").{minCountFilter}
}} ORDER BY DESC (?count)
"""
        title = f"most frequently used properties for {self.item.asText(long=True)}"
        query = Query(name=f"mostFrequentProperties for {itemText}",
                      query=sparqlQuery, title=title)
        return query

    def noneTabularQuery(self, wdProperty: WikidataProperty, asFrequency: bool = True):
        '''
        get the none tabular entries for the given property

        Args:
            wdProperty(WikidataProperty): the property to analyze
            asFrequency(bool): if true do a frequency analysis

        Returns:
            Query: the query for the frequencies of the value counts or - if
            asFrequency is False - for the items with more than one value
        '''
        propertyLabel = wdProperty.plabel
        propertyId = wdProperty.pid
        # work around https://github.com/RDFLib/sparqlwrapper/issues/211
        if "described at" in propertyLabel:
            propertyLabel = propertyLabel.replace("described at",
                                                  "describ'd at")
        sparql = f"""SELECT ?item ?itemLabel (COUNT (?value) AS ?count)
WHERE
{{
  # instance of {self.item.qlabel}
  ?item {self.subclassPredicate} wd:{self.itemQid}.{self.where}
  ?item rdfs:label ?itemLabel.
  FILTER (LANG(?itemLabel) = "{self.lang}").
  # {propertyLabel}
  ?item {wdProperty.getPredicate()} ?value.
}} GROUP BY ?item ?itemLabel
"""
        if asFrequency:
            freqDesc = "frequencies"
            # wrap the per item count as a subquery
            sparql = f"""SELECT ?count (COUNT(?count) AS ?frequency) WHERE {{{{
{sparql}
}}}}
GROUP BY ?count
ORDER BY DESC (?frequency)"""
        else:
            freqDesc = "records"
            sparql = f"""{sparql}
HAVING (COUNT (?value) > 1)
ORDER BY DESC(?count)"""
        itemText = self.getItemText()
        sparql = f"""# Count all {itemText} items
# with the given {propertyLabel}({propertyId}) https://www.wikidata.org/wiki/Property:{propertyId}
{WikidataItem.getPrefixes()}
""" + sparql
        title = f"non tabular entries for {self.item.qlabel}/{propertyLabel}:{freqDesc}"
        name = f"NonTabular {self.item.qlabel}/{propertyLabel}:{freqDesc}"
        query = Query(query=sparql, name=name, title=title)
        return query

    def noneTabular(self, wdProperty: WikidataProperty):
        '''
        get the none tabular result for the given Wikidata property

        Args:
            wdProperty(WikidataProperty): the Wikidata property

        Returns:
            list: the list of dicts result of the frequency query
        '''
        query = self.noneTabularQuery(wdProperty)
        if self.debug:
            logging.info(query.query)
        qlod = self.sparql.queryAsListOfDicts(query.query)
        return qlod

    def addStatsColWithPercent(self, m: dict, col: str, value: Union[int, float], total: Union[int, float]):
        '''
        add a statistics Column

        the value is stored under col and the percentage of the total
        (rounded to one decimal) under col followed by a percent sign

        Args:
            m(dict): the statistics row to add the column to
            col(str): name of the column
            value: value
            total: total value - if None or not positive the percentage is None
        '''
        m[col] = value
        if total is not None and total > 0:
            m[f"{col}%"] = float(f"{value/total*100:.1f}")
        else:
            m[f"{col}%"] = None

    def genWdPropertyStatistic(self, wdProperty: WikidataProperty, itemCount: int, withQuery=True) -> dict:
        '''
        generate a property Statistics Row for the given wikidata Property

        Args:
            wdProperty(WikidataProperty): the property to get the statistics for
            itemCount(int): the total number of items to check
            withQuery(bool): if true include the sparql query

        Returns:
            dict: a statistics row
        '''
        ntlod = self.noneTabular(wdProperty)
        statsRow = {"property": wdProperty.plabel}
        total = 0
        nttotal = 0
        maxCount = 0
        for record in ntlod:
            f = int(record["frequency"])
            count = int(record["count"])
            # items with more than one value are the non tabular ones
            if count > 1:
                nttotal += f
            else:
                statsRow["1"] = f
            if count > maxCount:
                maxCount = count
            total += f
        statsRow["maxf"] = maxCount
        if withQuery:
            statsRow["queryf"] = self.noneTabularQuery(wdProperty).query
            statsRow["queryex"] = self.noneTabularQuery(
                wdProperty, asFrequency=False).query
        self.addStatsColWithPercent(statsRow, "total", total, itemCount)
        self.addStatsColWithPercent(statsRow, "non tabular", nttotal, total)
        return statsRow

    def genPropertyStatistics(self):
        '''
        generate the property Statistics

        Returns:
            generator: a generator of statistic dict rows
        '''
        itemCount, _itemCountQuery = self.count()
        for wdProperty in self.properties.values():
            statsRow = self.genWdPropertyStatistic(wdProperty, itemCount)
            yield statsRow

    def getPropertyStatistics(self):
        '''
        get the property Statistics

        Returns:
            list: a summary row with the total item count followed by one
            statistics row per property
        '''
        itemCount, _itemCountQuery = self.count()
        lod = [{"property": "∑", "total": itemCount, "total%": 100.0}]
        for wdProperty in self.properties.values():
            statsRow = self.genWdPropertyStatistic(wdProperty, itemCount)
            lod.append(statsRow)
        return lod
def testQueryDocumentation(self):
    '''
    test QueryDocumentation: run a set of example queries and create
    the documentation of their results in all supported table formats
    '''
    # set show to True to always print the documentation
    show = self.debug
    wikidataEndpoint = "https://query.wikidata.org/sparql"
    nickNames = {
        "endpoint": wikidataEndpoint,
        "prefixes": [],
        "lang": "sparql",
        "name": "Nicknames",
        "description": "https://stackoverflow.com/questions/70206791/sparql-i-have-individual-with-multiple-values-for-single-object-property-how",
        "title": "Nick names of US Presidents",
        "query": """SELECT ?item ?itemLabel (GROUP_CONCAT(DISTINCT ?nickName; SEPARATOR=",") as ?nickNames)
WHERE
{
  # president
  ?item wdt:P39 wd:Q11696.
  ?item wdt:P1449 ?nickName
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} GROUP BY ?item ?itemLabel"""
    }
    cas15 = {
        "endpoint": wikidataEndpoint,
        "prefixes": [
            "http://www.wikidata.org/entity/",
            "http://commons.wikimedia.org/wiki/Special:FilePath/"
        ],
        "lang": "sparql",
        "name": "CAS15",
        "title": "15 Random substances with CAS number",
        "description": "Wikidata SPARQL query showing the 15 random chemical substances with their CAS Number",
        "query": """# List of 15 random chemical components with CAS-Number, formula and structure
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
# WF 2021-08-23
SELECT ?substance ?substanceLabel ?formula ?structure ?CAS
WHERE {
  ?substance wdt:P31 wd:Q11173.
  ?substance wdt:P231 ?CAS.
  ?substance wdt:P274 ?formula.
  ?substance wdt:P117 ?structure.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 15
"""
    }
    cityTop10 = {
        "endpoint": wikidataEndpoint,
        "prefixes": ["http://www.wikidata.org/entity/"],
        "lang": "sparql",
        "name": "CityTop10",
        "title": "Ten largest cities of the world",
        "description": "Wikidata SPARQL query showing the 10 most populated cities of the world using the million city class Q1637706 for selection",
        "query": """# Ten Largest cities of the world
# WF 2021-08-23
# see also http://wiki.bitplan.com/index.php/PyLoDStorage#Examples
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
SELECT DISTINCT ?city ?cityLabel ?population ?country ?countryLabel
WHERE {
  VALUES ?cityClass { wd:Q1637706}.
  ?city wdt:P31 ?cityClass .
  ?city wdt:P1082 ?population .
  ?city wdt:P17 ?country .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
ORDER BY DESC(?population)
LIMIT 10"""
    }
    osmPlaceTypes = {
        "endpoint": "https://sophox.org/sparql",
        "lang": "sparql",
        "prefixes": [],
        "query": """# count osm place type instances
# WF 2021-08-23
# see also http://wiki.bitplan.com/index.php/PyLoDStorage#Examples
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
SELECT (count(?instance) as ?count) ?placeType ?placeTypeLabel
WHERE {
  VALUES ?placeType {
    "city"
    "town"
    "village"
  }
  ?instance osmt:place ?placeType
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?placeType ?placeTypeLabel
ORDER BY ?count""",
        "name": "OSM place types",
        "title": "count OpenStreetMap place type instances",
        "description": """This SPARQL query determines the number of instances available in the OpenStreetMap for the placeTypes city,town and village
"""
    }
    tableFormats = ("mediawiki", "github", "latex")
    # switch to True to print the yaml markup of each query
    showYaml = False
    for queryMap in (nickNames, cas15, cityTop10, osmPlaceTypes):
        # the endpoint is not a Query attribute - take it out of the map
        endpointUrl = queryMap.pop("endpoint")
        endpoint = SPARQL(endpointUrl)
        query = Query(**queryMap)
        if showYaml:
            print(query.asYaml())
        try:
            qlod = endpoint.queryAsListOfDicts(query.query)
            for tablefmt in tableFormats:
                doc = query.documentQueryResult(qlod, tablefmt=tablefmt,
                                                floatfmt=".0f")
                docstr = doc.asText()
                if show:
                    print(docstr)
        except Exception as ex:
            # endpoint problems are reported but do not fail the test
            print(f"{query.title} at {endpointUrl} failed: {ex}")
def main(cls, args):
    '''
    command line activation with parsed args

    lists the available queries or endpoints or runs a query (named,
    given directly or read from a file) and prints the result in the
    selected format

    Args:
        args(list): the command line arguments

    Raises:
        Exception: if the named query is not available, the language is
        not known/supported or the format is not supported yet
    '''
    debug = args.debug
    endpoints = EndpointManager.getEndpoints(args.endpointPath)
    qm = QueryManager(lang=args.language, debug=debug,
                      queriesPath=args.queriesPath)
    query = None
    queryCode = args.query
    endpointConf = None
    formats = None
    # preload ValueFormatter
    ValueFormatter.getFormats(args.formatsPath)
    if args.list:
        # NOTE(review): this loop rebinds query, so after listing the
        # "query is None" fallback below is skipped - confirm this is intended
        for name, query in qm.queriesByName.items():
            print(f"{name}:{query.title}")
    elif args.listEndpoints:
        # list endpoints
        for endpoint in endpoints.values():
            if hasattr(endpoint, "lang") and endpoint.lang == args.language:
                print(endpoint)
    elif args.queryName is not None:
        if debug or args.showQuery:
            print(f"named query {args.queryName}:")
        if args.queryName not in qm.queriesByName:
            raise Exception(f"named query {args.queryName} not available")
        query = qm.queriesByName[args.queryName]
        formats = query.formats
        queryCode = query.query
        if debug or args.showQuery:
            if hasattr(query, "description") and query.description is not None:
                print(query.description)
    if query is None:
        name = "?"
        if queryCode is None and args.queryFile is not None:
            queryFilePath = Path(args.queryFile)
            queryCode = queryFilePath.read_text()
            name = queryFilePath.stem
        # use the name derived from the query file (it was computed but
        # never passed on before)
        query = Query(name=name, query=queryCode, lang=args.language)
    if queryCode:
        if debug or args.showQuery:
            print(f"{args.language}:\n{queryCode}")
        if args.endpointName:
            endpointConf = endpoints.get(args.endpointName)
        if args.language == "sparql":
            method = 'POST'
            if args.endpointName:
                endPointUrl = endpointConf.endpoint
                method = endpointConf.method
                query.tryItUrl = endpointConf.website
                query.database = endpointConf.database
            else:
                endPointUrl = query.endpoint
            if args.method:
                # an explicitly given method wins over the endpoint default
                # (this used to be the no-op "method = method")
                method = args.method
            sparql = SPARQL(endPointUrl, method=method)
            if args.prefixes and endpointConf is not None:
                queryCode = f"{endpointConf.prefixes}\n{queryCode}"
            if args.raw:
                qres = cls.rawQuery(endPointUrl, query=query.query,
                                    resultFormat=args.format,
                                    mimeType=args.mimeType)
                print(qres)
                return
            # endpointName may be None when the endpoint of the query is used
            if args.endpointName and "wikidata" in args.endpointName and formats is None:
                formats = ["*:wikidata"]
            qlod = sparql.queryAsListOfDicts(queryCode)
        elif args.language == "sql":
            sqlDB = SQLDB(endpointConf.endpoint)
            qlod = sqlDB.query(queryCode)
        else:
            raise Exception(
                f"language {args.language} not known/supported")
        if args.format is Format.csv:
            csv = CSV.toCSV(qlod)
            print(csv)
        elif args.format in [
                Format.latex, Format.github, Format.mediawiki
        ]:
            doc = query.documentQueryResult(qlod, tablefmt=str(args.format),
                                            floatfmt=".0f")
            docstr = doc.asText()
            print(docstr)
        elif args.format in [Format.json
                             ] or args.format is None:  # set as default
            # https://stackoverflow.com/a/36142844/1497139
            print(json.dumps(qlod, indent=2, sort_keys=True, default=str))
        elif args.format in [Format.xml]:
            lod2xml = Lod2Xml(qlod)
            xml = lod2xml.asXml()
            print(xml)
        else:
            raise Exception(f"format {args.format} not supported yet")