Esempio n. 1
0
 def getSampleTableDB(withDrop=False,debug=False,failIfTooFew=False,sampleSize=1000):
     listOfRecords=Sample.getSample(sampleSize)
     sqlDB=SQLDB()
     entityName="sample"
     primaryKey='pKey'
     sampleRecordCount=sampleSize*10
     sqlDB.debug=debug
     entityInfo=sqlDB.createTable(listOfRecords, entityName, primaryKey=primaryKey, withDrop=withDrop, sampleRecordCount=sampleRecordCount,failIfTooFew=failIfTooFew)
     executeMany=True
     fixNone=True
     sqlDB.store(listOfRecords,entityInfo,executeMany=executeMany,fixNone=fixNone)
     return sqlDB
Esempio n. 2
0
 def testWikidataCities(self):
     '''
     test getting city information from wikidata
     
     '''
     # Wikidata time outs in CI environment need to be avoided
     if getpass.getuser() != "wf":
         return
     config = StorageConfig.getSQL(debug=self.debug)
     config.cacheRootDir = "/tmp/wdhs"
     cachedir = config.getCachePath()
     config.cacheFile = f"{cachedir}/hs.db"
     # use 2018 wikidata copy
     # wikidata.endpoint="http://blazegraph.bitplan.com/sparql"
     # use 2020 wikidata copy
     wikidata = Wikidata()
     wikidata.endpoint = "https://confident.dbis.rwth-aachen.de/jena/wdhs/sparql"
     #wikidata.endpoint="http://jena.bitplan.com/wdhs/sparql"
     regions = [{
         "name": "Singapore",
         "country": "Q334",
         "region": None,
         "cities": 46
     }, {
         "name": "Beijing",
         "country": None,
         "region": "Q956",
         "cities": 25
     }, {
         "name": "Paris",
         "country": None,
         "region": "Q13917",
         "cities": 1242
     }, {
         "name": "Barcelona",
         "country": None,
         "region": "Q5705",
         "cities": 1242
     }, {
         "name": "Rome",
         "country": None,
         "region": "Q1282",
         "cities": 1242
     }]
     limit = 1000000  #if self.inCI() else 100
     cityList = wikidata.getCities(limit=limit)
     sqlDB = SQLDB(config.cacheFile)
     entityInfo = sqlDB.createTable(cityList, "hs", withDrop=True)
     sqlDB.store(cityList, entityInfo, fixNone=True)
     expected = 200000  # if self.inCI() else limit
     self.assertTrue(len(cityList) >= expected)
Esempio n. 3
0
 def testQueryParams(self):
     '''
     test Query Params
     '''
     listOfDicts=[
         {"city": "New York", "country": "US"},
         {"city": "Amsterdam", "country": "NL"},
         {"city": "Paris", "country": "FR"}]
     sqlDB=SQLDB(debug=self.debug,errorDebug=True)
     entityInfo=sqlDB.createTable(listOfDicts[:10],'cities','city')
     sqlDB.store(listOfDicts,entityInfo,executeMany=False)
     query="SELECT * from cities WHERE country in (?)"
     params=('FR',)
     frCities=sqlDB.query(query,params)
     if self.debug:
         print (frCities);
     self.assertEqual([{'city': 'Paris', 'country': 'FR'}],frCities)
Esempio n. 4
0
    def testUniqueConstraint(self):
        '''
        test for https://github.com/WolfgangFahl/pyLoDStorage/issues/4
        sqlite3.IntegrityError: UNIQUE constraint failed: ... show debug info
        '''
        listOfDicts=[
            {"name": "John Doe"},
            {"name": "Frank Doe"}, 
            {"name": "John Doe"}, 
            {"name":"Tim Doe"}]
        sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        entityInfo=sqlDB.createTable(listOfDicts[:10],'Does','name')
        try:
            sqlDB.store(listOfDicts,entityInfo,executeMany=False)
            self.fail("There should be an exception")
        except Exception as ex:
            expected="""INSERT INTO Does (name) values (:name)
failed:UNIQUE constraint failed: Does.name
record  #3={'name': 'John Doe'}"""
            errMsg=str(ex)
            self.assertEqual(expected,errMsg)
 def testWordParser(self):
     '''
     try finding quantiles
     
     see https://stackoverflow.com/questions/2374640/how-do-i-calculate-percentiles-with-python-numpy
     '''
     lookup = Lookup("test Word parser")
     sqlDB = lookup.getSQLDB()
     if sqlDB is not None:
         totalWordUsages = []
         for source in ['wikidata', 'crossref', 'dblp', 'CEUR-WS']:
             listOfDicts = TestWordParser.getProceedingsTitles(
                 sqlDB, source)
             cwp = CorpusWordParser()
             wordusages = cwp.parse(listOfDicts)
             lens = {}
             for wordusage in wordusages:
                 totalWordUsages.append(wordusage.__dict__)
                 if wordusage.eventId in lens:
                     lens[wordusage.eventId] += 1
                 else:
                     lens[wordusage.eventId] = 1
             df = DataFrame(lens.values())
             print(df.quantile(1))
             quantileValues = df.quantile(.90)
             print(quantileValues)
             plot = Plot(lens.values(),
                         "%s wordcount histogram" % source,
                         xlabel="wordcount",
                         ylabel="frequency")
             plot.hist(mode='save')
         wordUsageDBFile = Lookup.getDBFile("wordusage")
         wSQLDB = SQLDB(wordUsageDBFile)
         entityInfo = wSQLDB.createTable(totalWordUsages,
                                         "wordusage",
                                         withDrop=True)
         wSQLDB.store(totalWordUsages, entityInfo)
Esempio n. 6
0
class TestSQLDB(Basetest):
    '''
    Test the SQLDB database wrapper
    '''
    
    def checkListOfRecords(self,listOfRecords,entityName,primaryKey=None,executeMany=True,fixNone=False,fixDates=False,debug=False,doClose=True):
        '''
        check the handling of the given list of Records
        
        Args:
          
           listOfRecords(list): a list of dicts that contain the data to be stored
           entityName(string): the name of the entity type to be used as a table name
           primaryKey(string): the name of the key / column to be used as a primary key
           executeMany(boolean): True if executeMany mode of sqlite3 should be used
           fixNone(boolean): fix dict entries that are undefined to have a "None" entry
           debug(boolean): True if debug information e.g. CREATE TABLE and INSERT INTO commands should be shown
           doClose(boolean): True if the connection should be closed
      
        '''     
        size=len(listOfRecords)
        if self.debug:
            print("%s size is %d fixNone is %r fixDates is: %r" % (entityName,size,fixNone,fixDates))
        self.sqlDB=SQLDB(debug=debug,errorDebug=True)
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityName,primaryKey)
        startTime=time.time()
        self.sqlDB.store(listOfRecords,entityInfo,executeMany=executeMany,fixNone=fixNone)
        elapsed=time.time()-startTime
        if self.debug:
            print ("adding %d %s records took %5.3f s => %5.f records/s" % (size,entityName,elapsed,size/elapsed)) 
        resultList=self.sqlDB.queryAll(entityInfo,fixDates=fixDates)    
        if self.debug:
            print ("selecting %d %s records took %5.3f s => %5.f records/s" % (len(resultList),entityName,elapsed,len(resultList)/elapsed)) 
        if doClose:
            self.sqlDB.close()
        return resultList
    
    def testEntityInfo(self):
        '''
        test creating entityInfo from the sample record
        '''
        listOfRecords=Sample.getRoyals()
        entityInfo=EntityInfo(listOfRecords[:3],'Person','name',debug=True)
        self.assertEqual("CREATE TABLE Person(name TEXT PRIMARY KEY,born DATE,numberInLine INTEGER,wikidataurl TEXT,age FLOAT,ofAge BOOLEAN,lastmodified TIMESTAMP)",entityInfo.createTableCmd)
        self.assertEqual("INSERT INTO Person (name,born,numberInLine,wikidataurl,age,ofAge,lastmodified) values (:name,:born,:numberInLine,:wikidataurl,:age,:ofAge,:lastmodified)",entityInfo.insertCmd)
        self.sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityInfo.name,entityInfo.primaryKey)
        tableList=self.sqlDB.getTableList()
        if self.debug:
            print (tableList)
        self.assertEqual(1,len(tableList))
        personTable=tableList[0]
        self.assertEqual("Person",personTable['name'])
        self.assertEqual(7,len(personTable['columns']))
        uml=UML()
        plantUml=uml.tableListToPlantUml(tableList,packageName="Royals",withSkin=False)
        if self.debug:
            print(plantUml)
        expected="""package Royals {
  class Person << Entity >> {
   age : FLOAT 
   born : DATE 
   lastmodified : TIMESTAMP 
   name : TEXT <<PK>>
   numberInLine : INTEGER 
   ofAge : BOOLEAN 
   wikidataurl : TEXT 
  }
}
"""
        self.assertEqual(expected,plantUml)
        
        # testGeneralization
        listOfRecords=[{'name': 'Royal family', 'country': 'UK', 'lastmodified':datetime.now()}]
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],'Family','name')
        tableList=self.sqlDB.getTableList()
        self.assertEqual(2,len(tableList))
        uml=UML()
        plantUml=uml.tableListToPlantUml(tableList,generalizeTo="PersonBase",withSkin=False)
        if self.debug:
            print(plantUml)
        expected='''class PersonBase << Entity >> {
 lastmodified : TIMESTAMP 
 name : TEXT <<PK>>
}
class Person << Entity >> {
 age : FLOAT 
 born : DATE 
 numberInLine : INTEGER 
 ofAge : BOOLEAN 
 wikidataurl : TEXT 
}
class Family << Entity >> {
 country : TEXT 
}
PersonBase <|-- Person
PersonBase <|-- Family
'''
        self.assertEqual(expected,plantUml)
        
    def testIssue15(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/15
        
        auto create view ddl in mergeschema
        
        '''
        self.sqlDB=SQLDB(debug=self.debug,errorDebug=self.debug)
        listOfRecords=Sample.getRoyals()
        entityInfo=EntityInfo(listOfRecords[:3],'Person','name',debug=self.debug)
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityInfo.name,entityInfo.primaryKey)
        listOfRecords=[{'name': 'Royal family', 'country': 'UK', 'lastmodified':datetime.now()}]
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],'Family','name')
        tableList=self.sqlDB.getTableList()
        viewDDL=Schema.getGeneralViewDDL(tableList,"PersonBase")
        if self.debug:
            print (viewDDL)
        expected="""CREATE VIEW PersonBase AS 
  SELECT name,lastmodified FROM Person
UNION
  SELECT name,lastmodified FROM Family"""
        self.assertEqual(expected,viewDDL)
        pass
        
    def testUniqueConstraint(self):
        '''
        test for https://github.com/WolfgangFahl/pyLoDStorage/issues/4
        sqlite3.IntegrityError: UNIQUE constraint failed: ... show debug info
        '''
        listOfDicts=[
            {"name": "John Doe"},
            {"name": "Frank Doe"}, 
            {"name": "John Doe"}, 
            {"name":"Tim Doe"}]
        sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        entityInfo=sqlDB.createTable(listOfDicts[:10],'Does','name')
        try:
            sqlDB.store(listOfDicts,entityInfo,executeMany=False)
            self.fail("There should be an exception")
        except Exception as ex:
            expected="""INSERT INTO Does (name) values (:name)
failed:UNIQUE constraint failed: Does.name
record  #3={'name': 'John Doe'}"""
            errMsg=str(ex)
            self.assertEqual(expected,errMsg)
            
    def testSqlite3(self):
        '''
        test sqlite3 with a few records from the royal family
        '''
        listOfRecords=Sample.getRoyals()
        resultList=self.checkListOfRecords(listOfRecords, 'Person', 'name',debug=True)
        if self.debug:
            print(resultList)
        self.assertEqual(listOfRecords,resultList)
       
    def testIssue13_setNoneValue(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/13
        set None value for undefined LoD entries
        '''
        listOfRecords=[
            { 'make': 'Ford','model':'Model T', 'color':'black'},
            { 'make': 'VW', 'model':'beetle'}
        ]
        entityName="Car"
        primaryKey="Model"
        resultList=self.checkListOfRecords(listOfRecords, entityName, primaryKey,fixNone=True)
        if self.debug:
            print (resultList)
            
    def testIssue14_execute(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/14
        
        offer execute wrapper directly via sqlDB
        '''
        sqlDB=SQLDB()
        ddl="""
        CREATE TABLE contacts (
            contact_id INTEGER PRIMARY KEY,
            first_name TEXT NOT NULL,
            last_name TEXT NOT NULL
        )
        """
        sqlDB.execute(ddl)
        tableList=sqlDB.getTableList()
        if self.debug:
            print(tableList)
        self.assertEqual(1,len(tableList))
        self.assertEqual("contacts",tableList[0]['name'])
        
    def testIssue41(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/41
        improve error message when create table command fails
        '''
        listOfRecords=[{
            'name':'value',
            'py/object': 'datetime.time'
        }]
        self.sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        try:
            _entityInfo=self.sqlDB.createTable(listOfRecords[:1],'Invalid','name')
            self.fail("There should be an exception")
        except Exception as ex:
            self.assertTrue("CREATE TABLE Invalid" in str(ex))
         
    def testBindingError(self):
        '''
        test list of Records with incomplete record leading to
        "You did not supply a value for binding 2"
        see https://bugs.python.org/issue41638
        '''
        listOfRecords=[{'name':'Pikachu', 'type':'Electric'},{'name':'Raichu' }]
        for executeMany in [True,False]:
            try:
                self.checkListOfRecords(listOfRecords,'Pokemon','name',executeMany=executeMany)
                self.fail("There should be an exception")
            except Exception as ex:
                if self.debug:
                    print(str(ex))
                self.assertTrue('no value supplied for column' in str(ex))                                                         
        
    def testListOfCities(self):
        '''
        test sqlite3 with some 120000 city records
        '''
        listOfRecords=Sample.getCities()
        for fixDates in [True,False]:
            retrievedList=self.checkListOfRecords(listOfRecords,'City',fixDates=fixDates)
            self.assertEqual(len(listOfRecords),len(retrievedList))
            
    def testQueryParams(self):
        '''
        test Query Params
        '''
        listOfDicts=[
            {"city": "New York", "country": "US"},
            {"city": "Amsterdam", "country": "NL"},
            {"city": "Paris", "country": "FR"}]
        sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        entityInfo=sqlDB.createTable(listOfDicts[:10],'cities','city')
        sqlDB.store(listOfDicts,entityInfo,executeMany=False)
        query="SELECT * from cities WHERE country in (?)"
        params=('FR',)
        frCities=sqlDB.query(query,params)
        if self.debug:
            print (frCities);
        self.assertEqual([{'city': 'Paris', 'country': 'FR'}],frCities)
        
    def testSqllite3Speed(self):
        '''
        test sqlite3 speed with some 100000 artificial sample records
        consisting of two columns with a running index
        '''
        limit=100000
        listOfRecords=Sample.getSample(limit)
        self.checkListOfRecords(listOfRecords, 'Sample', 'pKey')  
        
    def testIssue87AllowUsingQueryWithGenerator(self):
        '''
        test the query gen approach
        '''   
        debug=self.debug
        #debug=True
        sqlDB=self.getSampleTableDB(sampleSize=5)
        sqlQuery="select * FROM sample"
        for cindex,record in enumerate(sqlDB.queryGen(sqlQuery)):
            if debug:
                print(record)
            self.assertEqual(cindex,record["cindex"])

    def testBackup(self):
        '''
        test creating a backup of the SQL database
        '''
        if sys.version_info >= (3, 7):
            listOfRecords=Sample.getCities()
            self.checkListOfRecords(listOfRecords,'City',fixDates=True,doClose=False)
            backupDB="/tmp/testSqlite.db"
            showProgress=200 if self.debug else 0
            self.sqlDB.backup(backupDB,profile=self.debug,showProgress=showProgress)
            size=os.stat(backupDB).st_size
            if self.debug:
                print ("size of backup DB is %d" % size)
            self.assertTrue(size>600000)
            self.sqlDB.close()
            # restore
            ramDB=SQLDB.restore(backupDB, SQLDB.RAM, profile=self.debug,showProgress=showProgress)
            entityInfo=EntityInfo(listOfRecords[:50],'City',debug=self.debug)
            allCities=ramDB.queryAll(entityInfo)
            self.assertEqual(len(allCities),len(listOfRecords))
            
    def testCopy(self):
        '''
        test copying databases into another database
        '''
        dbFile="/tmp/DAWT_Sample3x1000.db"
        copyDB=SQLDB(dbFile)
        for sampleNo in range(3):
            listOfRecords=Sample.getSample(1000)
            self.checkListOfRecords(listOfRecords, 'Sample_%d_1000' %sampleNo, 'pKey',doClose=False)  
            self.sqlDB.copyTo(copyDB)
        size=os.stat(dbFile).st_size
        if self.debug:
            print ("size of copy DB is %d" % size)
        self.assertTrue(size>70000)
        tableList=copyDB.getTableList()
        if self.debug:
            print(tableList)
        for sampleNo in range(3):
            self.assertEqual('Sample_%d_1000' %sampleNo,tableList[sampleNo]['name'])
        # check that database is writable
        # https://stackoverflow.com/a/44707371/1497139
        copyDB.execute("pragma user_version=0")
        
    @staticmethod
    def getSampleTableDB(withDrop=False,debug=False,failIfTooFew=False,sampleSize=1000):
        listOfRecords=Sample.getSample(sampleSize)
        sqlDB=SQLDB()
        entityName="sample"
        primaryKey='pKey'
        sampleRecordCount=sampleSize*10
        sqlDB.debug=debug
        entityInfo=sqlDB.createTable(listOfRecords, entityName, primaryKey=primaryKey, withDrop=withDrop, sampleRecordCount=sampleRecordCount,failIfTooFew=failIfTooFew)
        executeMany=True
        fixNone=True
        sqlDB.store(listOfRecords,entityInfo,executeMany=executeMany,fixNone=fixNone)
        return sqlDB
        
    def testIssue16(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/16
        allow to only warn if samplerecordcount is higher than number of available records
        '''
        self.getSampleTableDB(withDrop=False, debug=True,failIfTooFew=False)
        try:
            self.getSampleTableDB(withDrop=True, debug=True,failIfTooFew=True)
            self.fail("There should be an exception that too few sample records where provided")
        except Exception as ex:
            self.assertTrue("only 1000/10000 of needed sample records to createTable available" in str(ex))
        
    def testIssue18(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/18
        '''
        sqlDB=self.getSampleTableDB()
        tableDict=sqlDB.getTableDict()
        if self.debug:
            print (tableDict)
        self.assertTrue("sample" in tableDict)
        cols=tableDict["sample"]["columns"]
        self.assertTrue("pkey" in cols)
Esempio n. 7
0
    def getWikidataCityPopulation(self,sqlDB,endpoint=None):
        '''
        Args:
            sqlDB(SQLDB): target SQL database
            endpoint(str): url of the wikidata endpoint or None if default should be used
        '''
        dbFile=self.db_path+"/city_wikidata_population.db"
        rawTableName="cityPops"
        # is the wikidata population database available?
        if not os.path.exists(dbFile):
            # shall we created it from a wikidata query?
            if endpoint is not None:
                wikidata=Wikidata()
                wikidata.endpoint=endpoint
                cityList=wikidata.getCityPopulations()
                wikiCitiesDB=SQLDB(dbFile) 
                entityInfo=wikiCitiesDB.createTable(cityList[:300],rawTableName,primaryKey=None,withDrop=True)
                wikiCitiesDB.store(cityList,entityInfo,fixNone=True)
            else:
                # just download a copy 
                print("Downloading %s ... this might take a few seconds" % dbFile)
                dbUrl="http://wiki.bitplan.com/images/confident/city_wikidata_population.db"
                urllib.request.urlretrieve(dbUrl,dbFile)
        # (re) open the database
        wikiCitiesDB=SQLDB(dbFile) 
          
        # check whether the table is populated
        tableList=sqlDB.getTableList()        
        tableName="citiesWithPopulation"     
      
        if self.db_recordCount(tableList, tableName)<10000:
            # check that database is writable
            # https://stackoverflow.com/a/44707371/1497139
            sqlDB.execute("pragma user_version=0")
            # makes sure both tables are in target sqlDB
            wikiCitiesDB.copyTo(sqlDB)
            # create joined table
            sqlQuery="""
              select 
    geoname_id,
    city_name,
    cp.cityLabel,
    country_iso_code,
    country_name,
    subdivision_1_iso_code,
    subdivision_1_name,
    cp.city as wikidataurl,
    cp.cityPop 
  from cities c 
  join cityPops cp 
  on c.geoname_id=cp.geoNameId 
union  
  select 
    geoNameId as geoname_id,
    null as city_name,
    cityLabel,
    countryIsoCode as country_iso_code,
    countryLabel as country_name,
    null as subdivision_1_iso_code,
    null as subdivision_1_name,
    city as wikidataurl,
    cityPop 
  from cityPops 
  where cityPop is not Null
group by geoNameId
order by cityPop desc
            """
            cityList=sqlDB.query(sqlQuery) 
            entityInfo=sqlDB.createTable(cityList,tableName,primaryKey=None,withDrop=True,sampleRecordCount=500)
            sqlDB.store(cityList,entityInfo,fixNone=True)