Exemple #1
0
 def test_query_with_authentication(self):
     """
     tests querying an endpoint that requires authentication:
     the query is expected to be rejected before and to succeed
     after the credentials have been added
     """
     volumeQuery = """SELECT * WHERE { ?proceeding dblp:publishedInSeriesVolume "2816" .}"""
     endpoint = SPARQL("http://localhost:5820/dblp/query", method="POST")
     # without credentials the query must raise Unauthorized
     with self.assertRaises(SPARQLExceptions.Unauthorized):
         endpoint.queryAsListOfDicts(queryString=volumeQuery)
     # with credentials the very same query has to deliver two records
     endpoint.addAuthentication("admin", "admin")
     records = endpoint.queryAsListOfDicts(volumeQuery)
     self.assertEqual(2, len(records))
Exemple #2
0
    def getItemsByLabel(cls,
                        sparql: SPARQL,
                        itemLabel: str,
                        lang: str = "en") -> list:
        '''
        get a Wikidata items by the given label
        
        Args:
            sparql(SPARQL): the SPARQL endpoint to use
            itemLabel(str): the label of the items
            lang(str): the language of the label
            
        Returns:
            a list of potential items
        '''
        valuesClause = f'   "{itemLabel}"@{lang}\n'
        query = f"""# get the items that have the given label in the given language
# e.g. we'll find human=Q5 as the oldest type for the label "human" first
# and then the newer ones such as "race in Warcraft"
{cls.getPrefixes(["rdfs","schema","xsd"])}
SELECT 
  #?itemId 
  ?item 
  ?itemLabel 
  ?itemDescription
WHERE {{ 
  VALUES ?itemLabel {{
    {valuesClause}
  }}
  #BIND (xsd:integer(SUBSTR(STR(?item),33)) AS ?itemId)
  ?item rdfs:label ?itemLabel. 
  ?item schema:description ?itemDescription.
  FILTER(LANG(?itemDescription)="{lang}")
}} 
#ORDER BY ?itemId"""
        qLod = sparql.queryAsListOfDicts(query)
        items = []
        for record in qLod:
            url = record["item"]
            qid = re.sub(r"http://www.wikidata.org/entity/(.*)", r"\1", url)
            item = WikidataItem(qid)
            item.url = url
            item.qlabel = record["itemLabel"]
            item.varname = Variable.validVarName(item.qlabel)
            item.description = record["itemDescription"]
            items.append(item)
        sortedItems = sorted(items, key=lambda item: item.qnumber)
        return sortedItems
Exemple #3
0
    def testStackoverflow55961615Query(self):
        '''
        see 
        https://stackoverflow.com/questions/55961615/how-to-integrate-wikidata-query-in-python
        https://stackoverflow.com/a/69771615/1497139
        '''
        qlod = None
        try:
            endpoint = "https://query.wikidata.org/sparql"
            wd = SPARQL(endpoint)
            queryString = """SELECT ?s ?sLabel ?item ?itemLabel ?sourceCode ?webSite ?stackexchangeTag  {
    SERVICE wikibase:mwapi {
        bd:serviceParam wikibase:api "EntitySearch".
        bd:serviceParam wikibase:endpoint "www.wikidata.org".
        bd:serviceParam mwapi:search "natural language processing".
        bd:serviceParam mwapi:language "en".
        ?item wikibase:apiOutputItem mwapi:item.
        ?num wikibase:apiOrdinal true.
    }
    ?s wdt:P279|wdt:P31 ?item .
    OPTIONAL { 
      ?s wdt:P1324 ?sourceCode.
    }
    OPTIONAL {    
      ?s wdt:P856 ?webSite.
    }
    OPTIONAL {    
      ?s wdt:P1482 ?stackexchangeTag.
    }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
}
ORDER BY ?itemLabel ?sLabel"""
            qlod = wd.queryAsListOfDicts(queryString, fixNone=True)
        except Exception as ex:
            print(f"{endpoint} access failed with {ex}- could not run test")

        if qlod is not None:
            query = Query(name="EntitySearch",
                          query=queryString,
                          lang='sparql')
            debug = self.debug
            for tablefmt in ["github", "mediawiki", "latex"]:
                qdoc = query.documentQueryResult(qlod, tablefmt=tablefmt)
                if debug:
                    print(qdoc)
    def testStats(self):
        if not self.available():
            return
        queries = [
            Query(
                'entities and usage frequency', '''
# get histogramm data of entities by
# usage frequency
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT ?c  (COUNT(?c) AS ?count)
WHERE {
  ?subject a  ?c
}
GROUP BY ?c
HAVING (?count >100)
ORDER BY DESC(?count)
        '''),
            Query(
                'relevance of fields',
                '''# get histogramm data of properties by
# usage frequency
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
HAVING (?propTotal >1000)
ORDER BY DESC(?propTotal)''')
        ]
        sparql = SPARQL(self.endpoint)
        for query in queries:
            listOfDicts = sparql.queryAsListOfDicts(query.query)
            markup = query.asWikiMarkup(listOfDicts)
            markup = markup.replace(
                "https://d-nb.info/standards/elementset/gnd", "gnd")
            print("=== %s ===" % query.name)
            print(markup)
Exemple #5
0
    def testStackoverflow71444069(self):
        '''
        https://stackoverflow.com/questions/71444069/create-csv-from-result-of-a-for-google-colab/71548650#71548650
        '''
        from lodstorage.sparql import SPARQL
        from lodstorage.csv import CSV
        sparqlQuery = """SELECT ?org ?orgLabel
WHERE
{
  ?org wdt:P31 wd:Q4830453. #instance of organizations
  ?org wdt:P17 wd:Q96. #Mexico country

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en"}
}"""
        sparql = SPARQL("https://query.wikidata.org/sparql")
        qlod = sparql.queryAsListOfDicts(sparqlQuery)
        csv = CSV.toCSV(qlod)
        if self.debug:
            print(csv)
Exemple #6
0
 def test_SPARQL(self):
     '''
     test SPARQL queries
     '''
     # disable test for the time being
     return
     qm=QueryManager(lang='sparql',debug=False)
     self.assertEqual(4,len(qm.queriesByName))
     endpoint="http://localhost:3030/cr"
     sparql=SPARQL(endpoint)
     for name,query in qm.queriesByName.items():
         listOfDicts=sparql.queryAsListOfDicts(query.query)
         markup=query.asWikiMarkup(listOfDicts)
         markup=markup.replace("http://cr.bitplan.com/","https://cr.bitplan.com/index.php/Property:")
         print("== %s ==" % (name))
         print("=== query ===")
         print (query.asWikiSourceMarkup())
         print("=== result ===")
         print(markup)
     pass
Exemple #7
0
    def testIssue20And76(self):
        '''
        see https://github.com/WolfgangFahl/pyLoDStorage/issues/20
        add fixNone option to SPARQL results (same functionality as in SQL)
        
         https://github.com/WolfgangFahl/pyLoDStorage/issues/76
        SPARQL GET method support
        '''
        endpoint = "https://query.wikidata.org/sparql"
        for method in ["POST", "GET"]:
            wd = SPARQL(endpoint, method=method)
            queryString = """
        # Conference Series wikidata query
# see https://confident.dbis.rwth-aachen.de/dblpconf/wikidata
# WF 2021-01-30
SELECT ?confSeries ?short_name ?official_website
WHERE 
{
  #  scientific conference series (Q47258130) 
  ?confSeries wdt:P31 wd:Q47258130.
  OPTIONAL { 
    ?confSeries wdt:P1813 ?short_name . 
  }
  #  official website (P856) 
  OPTIONAL {
    ?confSeries wdt:P856 ?official_website
  } 
}
LIMIT 200
"""
            lod = wd.queryAsListOfDicts(queryString, fixNone=True)
            fields = LOD.getFields(lod)
            if self.debug:
                print(fields)
            for row in lod:
                for field in fields:
                    self.assertTrue(field in row)
Exemple #8
0
    def testSparqlQueries(self):
        '''
        run the selected named SPARQL queries and create the
        documentation of the results in several table formats
        '''
        show = self.debug
        selectedNames = ["US President Nicknames"]
        queryManager = QueryManager(lang='sparql', debug=False)
        for name, query in queryManager.queriesByName.items():
            if name not in selectedNames:
                continue
            if show:
                print(f"{name}:{query}")
            endpoint = SPARQL(query.endpoint)
            try:
                qlod = endpoint.queryAsListOfDicts(query.query)
                for tablefmt in ("mediawiki", "github", "latex"):
                    doc = query.documentQueryResult(qlod,
                                                    tablefmt=tablefmt,
                                                    floatfmt=".0f")
                    docText = doc.asText()
                    if show:
                        print(docText)
            except Exception as ex:
                # a failing query is only reported
                print(f"{query.title} at {query.endpoint} failed: {ex}")
class EntityManager(YamlAbleMixin, JsonAbleMixin):
    '''
    generic entity manager
    '''
    def __init__(self,
                 name,
                 entityName,
                 entityPluralName,
                 config=None,
                 debug=False):
        '''
        create me and prepare the access for the configured store mode
        
        Args:
            name(string): name of this eventManager
            entityName(string): entityType to be managed e.g. Country
            entityPluralName(string): plural of the the entityType e.g. Countries
            config(StorageConfig): the configuration to be used if None a default configuration will be used
            debug(boolean): override debug setting when default of config is used via config=None
            
        Raises:
            Exception: if the mode is SPARQL and the configuration has no endpoint
        '''
        self.name = name
        self.entityName = entityName
        self.entityPluralName = entityPluralName
        useDefaultConfig = config is None
        if useDefaultConfig:
            # the table name and debug setting are only adapted
            # for the default configuration
            config = StorageConfig.getDefault()
            if config.tableName is None:
                config.tableName = entityName
            if debug:
                config.debug = debug
        self.config = config
        cacheFile = self.getCacheFile(config=config, mode=config.mode)
        progressInfo = (self.entityName, config.mode, self.name, cacheFile)
        self.showProgress("Creating %smanager(%s) for %s using cache %s" %
                          progressInfo)
        if config.mode is StoreMode.DGRAPH:
            self.dgraph = Dgraph(debug=config.debug,
                                 host=config.host,
                                 profile=config.profile)
        elif config.mode is StoreMode.SPARQL:
            if config.endpoint is None:
                raise Exception("no endpoint set for mode sparql")
            self.endpoint = config.endpoint
            self.sparql = SPARQL(config.endpoint,
                                 debug=config.debug,
                                 profile=config.profile)
        elif config.mode is StoreMode.SQL:
            # may be True when issues are fixed
            self.executeMany = False

    def storeMode(self):
        '''
        get the store mode of my configuration
        
        Returns:
            the mode attribute of my config
        '''
        configuredMode = self.config.mode
        return configuredMode

    def showProgress(self, msg):
        '''
        print the given progress message (flushed immediately)
        if my configuration has withShowProgress set
        
        Args:
            msg(string): the message to display
        '''
        if not self.config.withShowProgress:
            return
        print(msg, flush=True)

    @staticmethod
    def getCachePath():
        path = os.path.dirname(__file__)
        cachedir = path + "/../cache"
        return cachedir

    def getCacheFile(self, config=None, mode=StoreMode.SQL):
        '''
        get the cache file for this event manager
        
        Args:
            config(StorageConfig): the configuration to use - if None my own
                configuration supplies the endpoint or the table name
            mode(StoreMode): the storeMode to use
            
        Returns:
            str: the cacheFile of the given configuration if it has one, otherwise
            the path derived for the mode - for SPARQL and unknown modes
            a descriptive text is returned instead of a path
        '''
        # an explicitly configured cache file always wins
        if config is not None and config.cacheFile is not None:
            return config.cacheFile
        if config is None:
            # fall back to my own configuration so that the endpoint and
            # tableName lookups below do not fail on None
            config = self.config
        cachedir = EntityManager.getCachePath()
        # get the path to the file for my cached data
        if mode is StoreMode.JSON:
            cachepath = "%s/%s-%s.%s" % (cachedir, self.name, "events", 'json')
        elif mode is StoreMode.SPARQL:
            cachepath = "%s %s" % ('SPARQL', config.endpoint)
        elif mode is StoreMode.SQL:
            cachepath = "%s/%s.db" % (cachedir, config.tableName)
        else:
            cachepath = "undefined cachepath for %s" % (mode)
        return cachepath

    def getSQLDB(self, cacheFile):
        '''
        create the SQL database for the given cacheFile and
        remember it in self.sqldb
        
        Args:
            cacheFile(string): the file to get the SQL db from
            
        Returns:
            SQLDB: the database created with my debug settings
        '''
        self.sqldb = SQLDB(cacheFile,
                           debug=self.config.debug,
                           errorDebug=self.config.errorDebug)
        return self.sqldb

    def isCached(self):
        ''' check whether there is a file containing cached 
        data for me '''
        result = False
        config = self.config
        mode = self.config.mode
        if mode is StoreMode.JSON:
            result = os.path.isfile(
                self.getCacheFile(config=self.config, mode=StoreMode.JSON))
        elif mode is StoreMode.SPARQL:
            # @FIXME - make abstract
            query = config.prefix + """
SELECT  ?source (COUNT(?source) AS ?sourcecount)
WHERE { 
   ?event cr:Event_source ?source.
}
GROUP by ?source
"""
            sourceCountList = self.sparql.queryAsListOfDicts(query)
            for sourceCount in sourceCountList:
                source = sourceCount['source']
                recordCount = sourceCount['sourcecount']
                if source == self.name and recordCount > 100:
                    result = True
        elif mode is StoreMode.SQL:
            cacheFile = self.getCacheFile(config=self.config,
                                          mode=StoreMode.SQL)
            if os.path.isfile(cacheFile):
                sqlQuery = "SELECT COUNT(*) AS count FROM %s" % config.tableName
                try:
                    sqlDB = self.getSQLDB(cacheFile)
                    countResult = sqlDB.query(sqlQuery)
                    count = countResult[0]['count']
                    result = count > 100
                except Exception as ex:
                    # e.g. sqlite3.OperationalError: no such table: Event_crossref
                    pass
        else:
            raise Exception("unsupported mode %s" % self.mode)
        return result

    def fromCache(self):
        '''
        get my entries from the cache - if there is no cached data yet
        the entries are fetched via getListOfDicts and stored first
        
        Returns:
            the list of Dicts and as a side effect setting self.cacheFile
        '''
        if self.isCached():
            # fromStore also sets self.cacheFile
            return self.fromStore()
        freshRecords = self.getListOfDicts()
        self.cacheFile = self.store(freshRecords)
        return freshRecords

    def fromStore(self, cacheFile=None):
        '''
        restore me from the store
        Args:
            cacheFile(String): the cacheFile to use if None use the preconfigured Cachefile
        Returns:
            list: list of dicts or JSON entitymanager
        '''
        startTime = time.time()
        if cacheFile is None:
            cacheFile = self.getCacheFile(config=self.config,
                                          mode=self.config.mode)
        self.cacheFile = cacheFile
        self.showProgress("reading %s for %s from cache %s" %
                          (self.entityPluralName, self.name, cacheFile))
        JSONem = None
        mode = self.config.mode
        if mode is StoreMode.JSON:
            JSONem = JsonAbleMixin.readJson(cacheFile)
        elif mode is StoreMode.SPARQL:
            # @FIXME make abstract
            eventQuery = """
PREFIX cr: <http://cr.bitplan.com/>
SELECT ?eventId ?acronym ?series ?title ?year ?country ?city ?startDate ?endDate ?url ?source WHERE { 
   OPTIONAL { ?event cr:Event_eventId ?eventId. }
   OPTIONAL { ?event cr:Event_acronym ?acronym. }
   OPTIONAL { ?event cr:Event_series ?series. }
   OPTIONAL { ?event cr:Event_title ?title. }
   OPTIONAL { ?event cr:Event_year ?year.  }
   OPTIONAL { ?event cr:Event_country ?country. }
   OPTIONAL { ?event cr:Event_city ?city. }
   OPTIONAL { ?event cr:Event_startDate ?startDate. }
   OPTIONAL { ?event cr:Event_endDate ?endDate. }
   OPTIONAL { ?event cr:Event_url ?url. }
   ?event cr:Event_source ?source FILTER(?source='%s').
}
""" % self.name
            listOfDicts = self.sparql.queryAsListOfDicts(eventQuery)
        elif mode is StoreMode.SQL:
            sqlQuery = "SELECT * FROM %s" % self.config.tableName
            sqlDB = self.getSQLDB(cacheFile)
            listOfDicts = sqlDB.query(sqlQuery)
            sqlDB.close()
            pass
        else:
            raise Exception("unsupported store mode %s" % self.mode)

        if JSONem is not None:
            return JSONem
        else:
            self.showProgress("read %d %s from %s in %5.1f s" %
                              (len(listOfDicts), self.entityPluralName,
                               self.name, time.time() - startTime))
            return listOfDicts

    def store(self,
              listOfDicts,
              limit=10000000,
              batchSize=250,
              cacheFile=None,
              sampleRecordCount=1):
        ''' 
        store my entities 
        
        Args:
            listOfDicts(list): the list of dicts to store
            limit(int): maximum number of records to store
            batchSize(int): size of batch for storing
            cacheFile(string): the name of the storage e.g path to JSON or sqlite3 file
            sampleRecordCount(int): the number of records to analyze for type information
            
        Returns:
            str: the cacheFile - for the DGRAPH and SPARQL modes this is
            the given cacheFile parameter unchanged (None by default)
            
        Raises:
            Exception: if my store mode is not supported
        '''
        config = self.config
        # the mode is kept in my config - there is no self.mode
        mode = config.mode
        if mode is StoreMode.JSON:
            if cacheFile is None:
                cacheFile = self.getCacheFile(config=config,
                                              mode=StoreMode.JSON)
            # writeJson is called without listOfDicts so the number
            # of my own events is reported here
            self.showProgress("storing %d events for %s to cache %s" %
                              (len(self.events), self.name, cacheFile))
            self.writeJson(cacheFile)
        elif mode is StoreMode.DGRAPH:
            startTime = time.time()
            # report the size of the list that is actually stored
            self.showProgress("storing %d %s for %s to %s" %
                              (len(listOfDicts), self.entityPluralName,
                               self.name, mode))
            self.dgraph.addData(listOfDicts, limit=limit, batchSize=batchSize)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        elif mode is StoreMode.SPARQL:
            startTime = time.time()
            # @ FIXME make abstract
            self.showProgress("storing %d events for %s to %s" %
                              (len(listOfDicts), self.name, mode))
            entityType = "cr:Event"
            prefixes = "PREFIX cr: <http://cr.bitplan.com/>"
            primaryKey = "eventId"
            self.sparql.insertListOfDicts(listOfDicts,
                                          entityType,
                                          primaryKey,
                                          prefixes,
                                          limit=limit,
                                          batchSize=batchSize)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        elif mode is StoreMode.SQL:
            startTime = time.time()
            if cacheFile is None:
                cacheFile = self.getCacheFile(config=config, mode=mode)
            sqldb = self.getSQLDB(cacheFile)
            self.showProgress("storing %d %s for %s to %s:%s" %
                              (len(listOfDicts), self.entityPluralName,
                               self.name, mode, cacheFile))
            # (re)create the table based on the sample records
            entityInfo = sqldb.createTable(listOfDicts,
                                           config.tableName,
                                           "eventId",
                                           withDrop=True,
                                           sampleRecordCount=sampleRecordCount)
            sqldb.store(listOfDicts,
                        entityInfo,
                        executeMany=self.executeMany)
            self.showProgress("store for %s done after %5.1f secs" %
                              (self.name, time.time() - startTime))
        else:
            raise Exception("unsupported store mode %s" % mode)
        return cacheFile
Exemple #10
0
class TrulyTabular(object):
    '''
    truly tabular SPARQL/RDF analysis
    
    checks "how tabular" a query based on a list of properties of an itemclass is
    '''
    def __init__(self,
                 itemQid,
                 propertyLabels: list = None,
                 propertyIds: list = None,
                 subclassPredicate="wdt:P31",
                 where: str = None,
                 endpointConf=None,
                 lang="en",
                 debug=False):
        '''
        Constructor
        
        Args:
            itemQid(str): wikidata id of the type to analyze 
            propertyLabels(list): a list of labels of properties to be considered - None is handled as an empty list
            propertyIds(list): a list of ids of properties to be considered - None is handled as an empty list
            subclassPredicate(str): the subclass Predicate to use
            where(str): extra where clause for instance selection (if any)
            endpointConf(Endpoint): the configuration of the SPARQL endpoint to be used - if None the default is used
            lang(str): the language to use for labels
            debug(bool): if True switch on debugging
        '''
        # avoid mutable default arguments shared between calls
        if propertyLabels is None:
            propertyLabels = []
        if propertyIds is None:
            propertyIds = []
        self.itemQid = itemQid
        self.debug = debug
        if endpointConf is None:
            endpointConf = Endpoint.getDefault()
        self.endpointConf = endpointConf
        self.sparql = SPARQL(endpointConf.endpoint,
                             method=self.endpointConf.method)
        self.sparql.debug = self.debug
        self.subclassPredicate = subclassPredicate
        # the extra where clause is appended on a separate indented line
        self.where = f"\n  {where}" if where is not None else ""
        self.lang = lang
        self.item = WikidataItem(itemQid, sparql=self.sparql, lang=lang)
        self.queryManager = TrulyTabular.getQueryManager(debug=self.debug)
        # the properties found by id are extended by the ones found by label
        self.properties = WikidataProperty.getPropertiesByIds(
            self.sparql, propertyIds, lang)
        self.properties.update(
            WikidataProperty.getPropertiesByLabels(self.sparql, propertyLabels,
                                                   lang))
        self.isodate = datetime.datetime.now().isoformat()
        self.error = None

    def __str__(self):
        '''
        Returns:
            str: my text representation
        '''
        return self.asText(long=False)

    def count(self):
        '''
        get my count
        '''
        itemText = self.getItemText()
        query = f"""# Count all items with the given type
# {itemText}
{WikidataItem.getPrefixes()}
SELECT (COUNT (DISTINCT ?item) AS ?count)
WHERE
{{
  # instance of {self.item.qlabel}
  ?item {self.subclassPredicate} wd:{self.item.qid}.{self.where}
}}"""
        try:
            count = self.sparql.getValue(query, "count")
            # workaround https://github.com/ad-freiburg/qlever/issues/717
            count = int(count)
        except Exception as ex:
            self.error = ex
            count = None

        return count, query

    def asText(self, long: bool = True):
        '''
        get the text representation of my item
        
        Args:
            long(bool): True if a long format including url is wished
            
        Returns:
            str: the text representation as supplied by my item
        '''
        return self.item.asText(long)

    def getItemText(self):
        '''
        get a short text for my item
        
        Returns:
            str: my item id and the label of my item separated by a colon
        '''
        # self.asText(long=True) is not used here since it
        # leads to 405 Method not allowed in SPARQLWrapper under certain circumstances
        return f"{self.itemQid}:{self.item.qlabel}"

    @classmethod
    def getQueryManager(cls, lang='sparql', name="trulytabular", debug=False):
        '''
        get the query manager for the given language and fileName
        
        Args:
            lang(str): the language of the queries to extract
            name(str): the name of the manager containing the query specifications
            debug(bool): if True set debugging on
            
        Returns:
            QueryManager: the manager for the first existing yaml file
            or None if there is no such file
        '''
        candidatePaths = YamlPath.getPaths(f"{name}.yaml")
        # pick the first candidate that is an existing file
        existingPaths = (path for path in candidatePaths
                         if os.path.isfile(path))
        yamlFile = next(existingPaths, None)
        if yamlFile is None:
            return None
        return QueryManager(lang=lang, debug=debug, queriesPath=yamlFile)

    def generateSparqlQuery(self,
                            genMap: dict,
                            listSeparator: str = "⇹",
                            naive: bool = True,
                            lang: str = 'en') -> str:
        '''
        generate a SPARQL Query for my item and my properties
        
        The query consists of a comment header with the prefixes, the SELECT
        variables, a WHERE clause with one OPTIONAL block per property and
        - in the non naive mode - a GROUP BY and an optional HAVING clause.
        
        Args:
            genMap(dict): a dictionary of generation items aggregates/ignores/labels;
                the lookups below use the property id as key and expect a list
                of entries such as "ignore", "label", "list", "count"
                or other aggregate function names
            listSeparator(str): the symbol to use as a list separator for GROUP_CONCAT
            naive(bool): if True - generate a naive straight forward SPARQL query
                if False generate a proper truly tabular aggregate query
            lang(str): the language to generate for
            
        Returns:
            str: the generated SPARQL Query
        '''
        # The Wikidata item to generate the query for
        item = self.item
        # the name of this script
        script = Path(__file__).name
        # the mode of generation
        naiveText = "naive" if naive else "aggregate"
        # start with the preamble and PREFIX section
        # select the item and its label
        sparqlQuery = f"""# truly tabular {naiveText} query for 
# {item.qid}:{item.qlabel}
# generated by {script} version {Version.version} on {self.isodate}
{WikidataItem.getPrefixes()}
SELECT ?{item.itemVarname} ?{item.labelVarname}"""
        # loop over all properties
        for wdProp in self.properties.values():
            if naive:
                # naive mode: simply select the value of each property
                sparqlQuery += f"\n  ?{wdProp.valueVarname}"
            else:
                # aggregate mode: only properties with a genMap entry are selected
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    for aggregate in genList:
                        if not aggregate in ["ignore", "label"]:
                            # a real aggregate: select it as <valueVarname>_<aggregate>
                            distinct = ""
                            if aggregate == "list":
                                # lists are concatenated distinct values
                                aggregateFunc = "GROUP_CONCAT"
                                aggregateParam = f';SEPARATOR="{listSeparator}"'
                                distinct = "DISTINCT "
                            else:
                                if aggregate == "count":
                                    distinct = "DISTINCT "
                                # any other entry is used as the name of the aggregate function
                                aggregateFunc = aggregate.upper()
                                aggregateParam = ""
                            sparqlQuery += f"\n  ({aggregateFunc} ({distinct}?{wdProp.valueVarname}{aggregateParam}) AS ?{wdProp.valueVarname}_{aggregate})"
                        elif aggregate == "label":
                            sparqlQuery += f"\n  ?{wdProp.labelVarname}"
                        elif aggregate == "ignore" and not "label" in genList:
                            # the plain value is only selected if the label is not selected
                            sparqlQuery += f"\n  ?{wdProp.valueVarname}"
        # the WHERE clause starts with the instance selection and the label of the item
        sparqlQuery += f"""
WHERE {{
  # instanceof {item.qid}:{item.qlabel}
  ?{item.itemVarname} {self.subclassPredicate} wd:{item.qid}.
  # label
  ?{item.itemVarname} rdfs:label ?{item.labelVarname}.  
  FILTER (LANG(?{item.labelVarname}) = "{lang}").
"""
        # each property gets its own OPTIONAL block
        for wdProp in self.properties.values():
            sparqlQuery += f"""  # {wdProp}
  OPTIONAL {{ 
    ?{item.itemVarname} wdt:{wdProp.pid} ?{wdProp.valueVarname}. """
            if wdProp.pid in genMap:
                genList = genMap[wdProp.pid]
                if "label" in genList:
                    # fetch the label of the value in the requested language
                    sparqlQuery += f"""\n    ?{wdProp.valueVarname} rdfs:label ?{wdProp.labelVarname}."""
                    sparqlQuery += f"""\n    FILTER (LANG(?{wdProp.labelVarname}) = "{lang}")."""
            sparqlQuery += "\n  }\n"
        # close where Clause
        sparqlQuery += """}\n"""
        # optionally add Aggregate
        if not naive:
            sparqlQuery += f"""GROUP BY
  ?{item.itemVarname} 
  ?{item.labelVarname}
"""
            # the variables selected without an aggregate are added to the GROUP BY
            for wdProp in self.properties.values():
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    if "label" in genList:
                        sparqlQuery += f"\n  ?{wdProp.labelVarname}"
                    if "ignore" in genList and not "label" in genList:
                        sparqlQuery += f"\n  ?{wdProp.valueVarname}"
            # each "ignore" entry adds a COUNT(..)<=1 condition to the HAVING clause
            havingCount = 0
            havingDelim = "   "
            for wdProp in self.properties.values():
                if wdProp.pid in genMap:
                    genList = genMap[wdProp.pid]
                    if "ignore" in genList:
                        havingCount += 1
                        if havingCount == 1:
                            # open the HAVING clause with the first condition
                            sparqlQuery += f"\nHAVING ("

                        sparqlQuery += f"\n  {havingDelim}COUNT(?{wdProp.valueVarname})<=1"
                        # all further conditions are combined with a logical and
                        havingDelim = "&& "
            if havingCount > 0:
                sparqlQuery += f"\n)"
        return sparqlQuery

    def mostFrequentPropertiesQuery(self,
                                    whereClause: str = None,
                                    minCount: int = 0):
        '''
        get the most frequently used properties
        
        Args:
            whereClause(str): an extra WhereClause to use
        '''
        if whereClause is None:
            whereClause = f"?item {self.subclassPredicate} wd:{self.itemQid}"
            if self.endpointConf.database != "qlever":
                whereClause += ";?p ?id"
        whereClause += "."
        minCountFilter = ""
        if minCount > 0:
            minCountFilter = f"\n  FILTER(?count >{minCount})."
        itemText = self.getItemText()
        sparqlQuery = f"""# get the most frequently used properties for
# {itemText}
{WikidataItem.getPrefixes()}
SELECT ?prop ?propLabel ?wbType ?count WHERE {{
  {{"""
        if self.endpointConf.database == "qlever":
            sparqlQuery += f"""
    SELECT ?p (COUNT(DISTINCT ?item) AS ?count) WHERE {{"""
        else:
            sparqlQuery += f"""
    SELECT ?prop (COUNT(DISTINCT ?item) AS ?count) WHERE {{"""
        if self.endpointConf.database == "blazegraph":
            sparqlQuery += f"""
      hint:Query hint:optimizer "None"."""
        sparqlQuery += f"""
      {whereClause}"""
        if self.endpointConf.database == "qlever":
            sparqlQuery += f"""  
      ?item ql:has-predicate ?p 
    }} GROUP BY ?p
  }}
  ?prop wikibase:directClaim ?p."""
        else:
            sparqlQuery += f"""
      ?prop wikibase:directClaim ?p.
    }}
    GROUP BY ?prop ?propLabel
  }}"""
        sparqlQuery += f"""
  ?prop rdfs:label ?propLabel.
  ?prop wikibase:propertyType ?wbType.
  FILTER(LANG(?propLabel) = "{self.lang}").{minCountFilter}  
}}
ORDER BY DESC (?count)
"""
        title = f"most frequently used properties for {self.item.asText(long=True)}"
        query = Query(name=f"mostFrequentProperties for {itemText}",
                      query=sparqlQuery,
                      title=title)
        return query

    def noneTabularQuery(self,
                         wdProperty: WikidataProperty,
                         asFrequency: bool = True):
        '''
        get the none tabular entries for the given property
        
        Args:
            wdProperty(WikidataProperty): the property to analyze
            asFrequency(bool): if true do a frequency analysis
        '''
        propertyLabel = wdProperty.plabel
        propertyId = wdProperty.pid
        # work around https://github.com/RDFLib/sparqlwrapper/issues/211
        if "described at" in propertyLabel:
            propertyLabel = propertyLabel.replace("described at",
                                                  "describ'd at")
        sparql = f"""SELECT ?item ?itemLabel (COUNT (?value) AS ?count)
WHERE
{{
  # instance of {self.item.qlabel}
  ?item {self.subclassPredicate} wd:{self.itemQid}.{self.where}
  ?item rdfs:label ?itemLabel.
  FILTER (LANG(?itemLabel) = "{self.lang}").
  # {propertyLabel}
  ?item {wdProperty.getPredicate()} ?value.
}} GROUP BY ?item ?itemLabel
"""
        if asFrequency:
            freqDesc = "frequencies"
            sparql = f"""SELECT ?count (COUNT(?count) AS ?frequency) WHERE {{{{
{sparql}
}}}}
GROUP BY ?count
ORDER BY DESC (?frequency)"""
        else:
            freqDesc = "records"
            sparql = f"""{sparql}
HAVING (COUNT (?value) > 1)
ORDER BY DESC(?count)"""
        itemText = self.getItemText()
        sparql = f"""# Count all {itemText} items
# with the given {propertyLabel}({propertyId}) https://www.wikidata.org/wiki/Property:{propertyId} 
{WikidataItem.getPrefixes()}
""" + sparql
        title = f"non tabular entries for {self.item.qlabel}/{propertyLabel}:{freqDesc}"
        name = f"NonTabular {self.item.qlabel}/{propertyLabel}:{freqDesc}"
        query = Query(query=sparql, name=name, title=title)
        return query

    def noneTabular(self, wdProperty: WikidataProperty):
        '''
        run the frequency variant of the non tabular query
        for the given Wikidata property
        
        Args:
            wdProperty(WikidataProperty): the Wikidata property
            
        Returns:
            the result of queryAsListOfDicts for the generated query
        '''
        sparqlQuery = self.noneTabularQuery(wdProperty).query
        # in debug mode log the generated SPARQL before sending it
        if self.debug:
            logging.info(sparqlQuery)
        return self.sparql.queryAsListOfDicts(sparqlQuery)

    def addStatsColWithPercent(self, m: dict, col: str,
                               value: Union[int, float],
                               total: Union[int, float]):
        '''
        add a statistics column and its percentage column to the given row
        
        Args:
            m(dict): the row to modify in place
            col(str): name of the column; the percentage is stored
                under the same name with a "%" suffix
            value: the value to store
            total: the total the value is compared to; if it is None
                or not positive the percentage is set to None
        '''
        m[col] = value
        percentCol = f"{col}%"
        # guard clause: no meaningful percentage without a positive total
        if total is None or not total > 0:
            m[percentCol] = None
            return
        # format to one decimal and parse back to get a rounded float
        m[percentCol] = float(format(value / total * 100, ".1f"))

    def genWdPropertyStatistic(self,
                               wdProperty: WikidataProperty,
                               itemCount: int,
                               withQuery=True) -> dict:
        '''
        generate a property statistics row for the given Wikidata property
        
        Args:
            wdProperty(WikidataProperty): the property to get the statistics for
            itemCount(int): the total number of items to check
            withQuery(bool): if true include the sparql queries
            
        Returns:
            dict: a statistics row with the property label, the frequency
            of single value items ("1" - only set when such a record exists),
            the highest value count ("maxf"), optionally the queries and
            the "total" and "non tabular" columns with their percentages
        '''
        frequencyRecords = self.noneTabular(wdProperty)
        row = {"property": wdProperty.plabel}
        itemsWithProperty = 0
        nonTabularItems = 0
        highestCount = 0
        for rec in frequencyRecords:
            freq = int(rec["frequency"])
            valueCount = int(rec["count"])
            itemsWithProperty += freq
            highestCount = max(highestCount, valueCount)
            if valueCount <= 1:
                # items with a single value are tabular
                row["1"] = freq
            else:
                # more than one value per item does not fit a table cell
                nonTabularItems += freq
        row["maxf"] = highestCount
        if withQuery:
            # "queryf" is the frequency query, "queryex" lists the records
            for key, frequencyMode in (("queryf", True), ("queryex", False)):
                row[key] = self.noneTabularQuery(
                    wdProperty, asFrequency=frequencyMode).query
        self.addStatsColWithPercent(row, "total", itemsWithProperty,
                                    itemCount)
        self.addStatsColWithPercent(row, "non tabular", nonTabularItems,
                                    itemsWithProperty)
        return row

    def genPropertyStatistics(self):
        '''
        lazily generate the property statistics
        
        The item count is determined once and then shared by all rows.
        
        Returns:
            generator: a generator of statistic dict rows, one
            per entry of self.properties
        '''
        itemCount, _itemCountQuery = self.count()
        yield from (self.genWdPropertyStatistic(prop, itemCount)
                    for prop in self.properties.values())

    def getPropertyStatistics(self):
        '''
        get the property statistics as a list
        
        Returns:
            list: a list of dicts starting with a "∑" row that holds the
            total item count, followed by one statistics row
            per entry of self.properties
        '''
        itemCount, _itemCountQuery = self.count()
        # leading summary row: all items are 100 %
        sumRow = {"property": "∑", "total": itemCount, "total%": 100.0}
        propertyRows = [
            self.genWdPropertyStatistic(prop, itemCount)
            for prop in self.properties.values()
        ]
        return [sumRow] + propertyRows
Exemple #11
0
    def testQueryDocumentation(self):
        '''
        test QueryDocumentation
        
        Runs each of the example queries below against its endpoint and
        documents the result in the mediawiki, github and latex table
        formats. A failing query is only reported via print and does
        not fail the test.
        '''
        show = self.debug
        # uncomment to always print the generated documentation
        #show=True
        # each map holds the endpoint url plus the keyword arguments
        # that are handed to the Query constructor
        queries = [{
            "endpoint":
            "https://query.wikidata.org/sparql",
            "prefixes": [],
            "lang":
            "sparql",
            "name":
            "Nicknames",
            "description":
            "https://stackoverflow.com/questions/70206791/sparql-i-have-individual-with-multiple-values-for-single-object-property-how",
            "title":
            "Nick names of US Presidents",
            "query":
            """SELECT ?item ?itemLabel (GROUP_CONCAT(DISTINCT ?nickName; SEPARATOR=",") as ?nickNames)
WHERE 
{
  # president
  ?item wdt:P39 wd:Q11696.
  ?item wdt:P1449 ?nickName
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} GROUP BY ?item ?itemLabel"""
        }, {
            "endpoint":
            "https://query.wikidata.org/sparql",
            "prefixes": [
                "http://www.wikidata.org/entity/",
                "http://commons.wikimedia.org/wiki/Special:FilePath/"
            ],
            "lang":
            "sparql",
            "name":
            "CAS15",
            "title":
            "15 Random substances with CAS number",
            "description":
            "Wikidata SPARQL query showing the 15 random chemical substances with their CAS Number",
            "query":
            """# List of 15 random chemical components with CAS-Number, formula and structure
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
# WF 2021-08-23
SELECT ?substance ?substanceLabel ?formula ?structure ?CAS
WHERE { 
  ?substance wdt:P31 wd:Q11173.
  ?substance wdt:P231 ?CAS.
  ?substance wdt:P274 ?formula.
  ?substance wdt:P117  ?structure.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 15
"""
        }, {
            "endpoint":
            "https://query.wikidata.org/sparql",
            "prefixes": ["http://www.wikidata.org/entity/"],
            "lang":
            "sparql",
            "name":
            "CityTop10",
            "title":
            "Ten largest cities of the world",
            "description":
            "Wikidata SPARQL query showing the 10 most populated cities of the world using the million city class Q1637706 for selection",
            "query":
            """# Ten Largest cities of the world 
# WF 2021-08-23
# see also http://wiki.bitplan.com/index.php/PyLoDStorage#Examples
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
SELECT DISTINCT ?city ?cityLabel ?population ?country ?countryLabel 
WHERE {
  VALUES ?cityClass { wd:Q1637706}.
  ?city wdt:P31 ?cityClass .
  ?city wdt:P1082 ?population .
  ?city wdt:P17 ?country .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
ORDER BY DESC(?population)
LIMIT 10"""
        }, {
            "endpoint":
            "https://sophox.org/sparql",
            "lang":
            "sparql",
            "prefixes": [],
            "query":
            """# count osm place type instances
# WF 2021-08-23
# see also http://wiki.bitplan.com/index.php/PyLoDStorage#Examples
# see also https://github.com/WolfgangFahl/pyLoDStorage/issues/46
SELECT (count(?instance) as ?count) ?placeType ?placeTypeLabel
WHERE { 
  VALUES ?placeType {
    "city"
    "town"
    "village"
  }
  ?instance osmt:place ?placeType
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?placeType ?placeTypeLabel
ORDER BY ?count""",
            "name":
            "OSM place types",
            "title":
            "count OpenStreetMap place type instances",
            "description":
            """This SPARQL query 
determines the number of instances available in the OpenStreetMap for the placeTypes city,town and village
"""
        }]
        for queryMap in queries:
            # the endpoint is removed from the map so that the remaining
            # entries can be passed on as Query keyword arguments
            endpointUrl = queryMap.pop("endpoint")
            endpoint = SPARQL(endpointUrl)
            query = Query(**queryMap)
            # manual switch: set to True to print the YAML markup of the query
            showYaml = False
            if showYaml:
                yamlMarkup = query.asYaml()
                print(yamlMarkup)
            try:
                qlod = endpoint.queryAsListOfDicts(query.query)
                # document the same result once per table format
                for tablefmt in ["mediawiki", "github", "latex"]:
                    doc = query.documentQueryResult(qlod,
                                                    tablefmt=tablefmt,
                                                    floatfmt=".0f")
                    docstr = doc.asText()
                    if show:
                        print(docstr)

            except Exception as ex:
                # best effort: a failure is reported but does not
                # abort the loop or fail the test
                print(f"{query.title} at {endpointUrl} failed: {ex}")
    def main(cls, args):
        '''
        command line activation with parsed args
        
        Depending on the arguments this lists the named queries, lists
        the endpoints for the selected language, or runs a query (named,
        given inline or read from a file) and prints the result in the
        requested format.
        
        Args:
            args(list): the command line arguments
            
        Raises:
            Exception: if the named query or the endpoint is not available
                or the query language or the output format is not supported
        '''
        debug = args.debug
        endpoints = EndpointManager.getEndpoints(args.endpointPath)
        qm = QueryManager(lang=args.language,
                          debug=debug,
                          queriesPath=args.queriesPath)
        query = None
        queryCode = args.query
        endpointConf = None
        formats = None
        # preload ValueFormatter
        ValueFormatter.getFormats(args.formatsPath)
        if args.list:
            for name, query in qm.queriesByName.items():
                print(f"{name}:{query.title}")
        elif args.listEndpoints:
            # list endpoints
            for endpoint in endpoints.values():
                if hasattr(endpoint,
                           "lang") and endpoint.lang == args.language:
                    print(endpoint)

        elif args.queryName is not None:
            if debug or args.showQuery:
                print(f"named query {args.queryName}:")
            if args.queryName not in qm.queriesByName:
                raise Exception(f"named query {args.queryName} not available")
            query = qm.queriesByName[args.queryName]
            formats = query.formats
            queryCode = query.query
            if debug or args.showQuery:
                if hasattr(query,
                           "description") and query.description is not None:
                    print(query.description)
        if query is None:
            # ad hoc query: given inline or read from a query file
            name = "?"
            if queryCode is None and args.queryFile is not None:
                queryFilePath = Path(args.queryFile)
                queryCode = queryFilePath.read_text()
                name = queryFilePath.stem
            # use the derived name (the file stem when read from a file)
            query = Query(name=name, query=queryCode, lang=args.language)
        if queryCode:
            if debug or args.showQuery:
                print(f"{args.language}:\n{queryCode}")
            if args.endpointName:
                endpointConf = endpoints.get(args.endpointName)
                if endpointConf is None:
                    # fail early with a clear message instead of an
                    # AttributeError on None further down
                    raise Exception(
                        f"endpoint {args.endpointName} not available")
            if args.language == "sparql":
                method = 'POST'
                if endpointConf is not None:
                    endPointUrl = endpointConf.endpoint
                    method = endpointConf.method
                    query.tryItUrl = endpointConf.website
                    query.database = endpointConf.database
                else:
                    endPointUrl = query.endpoint
                if args.method:
                    # the command line overrides the configured method
                    method = args.method
                sparql = SPARQL(endPointUrl, method=method)
                if args.prefixes and endpointConf is not None:
                    queryCode = f"{endpointConf.prefixes}\n{queryCode}"
                if args.raw:
                    qres = cls.rawQuery(endPointUrl,
                                        query=query.query,
                                        resultFormat=args.format,
                                        mimeType=args.mimeType)
                    print(qres)
                    return
                # endpointName may be unset when the query supplies
                # its own endpoint
                if (args.endpointName and "wikidata" in args.endpointName
                        and formats is None):
                    formats = ["*:wikidata"]
                qlod = sparql.queryAsListOfDicts(queryCode)
            elif args.language == "sql":
                if endpointConf is None:
                    raise Exception("sql queries need an endpointName")
                sqlDB = SQLDB(endpointConf.endpoint)
                qlod = sqlDB.query(queryCode)
            else:
                raise Exception(
                    f"language {args.language} not known/supported")
            if args.format is Format.csv:
                csv = CSV.toCSV(qlod)
                print(csv)
            elif args.format in [
                    Format.latex, Format.github, Format.mediawiki
            ]:
                doc = query.documentQueryResult(qlod,
                                                tablefmt=str(args.format),
                                                floatfmt=".0f")
                docstr = doc.asText()
                print(docstr)
            elif args.format in [Format.json
                                 ] or args.format is None:  # set as default
                # https://stackoverflow.com/a/36142844/1497139
                print(json.dumps(qlod, indent=2, sort_keys=True, default=str))
            elif args.format in [Format.xml]:
                lod2xml = Lod2Xml(qlod)
                xml = lod2xml.asXml()
                print(xml)

            else:
                raise Exception(f"format {args.format} not supported yet")