def store(self, cacheFileName='Event_all'):
    '''
    store my contents to the given cacheFileName - implemented as SQL storage

    Args:
        cacheFileName(string): the path to the database

    Returns:
        list: the list of errors encountered (currently always empty)

    Raises:
        Exception: if any of my event managers is not configured for SQL store mode
    '''
    dbfile = Lookup.getDBFile(cacheFileName)
    # remove existing database dump if it exists
    if os.path.exists(dbfile):
        os.remove(dbfile)
    backup = SQLDB(dbfile)
    errors = []
    print("storing %s to %s" % (self.name, dbfile))
    for em in self.ems:
        # only SQL-mode event managers can be copied at the database level
        if em.config.mode is not StoreMode.SQL:
            raise Exception(
                "lookup store only support SQL storemode but found %s for %s" % (em.config.mode, em.name))
        cacheFile = em.getCacheFile(config=em.config, mode=StoreMode.SQL)
        sqlDB = em.getSQLDB(cacheFile)
        # merge this event manager's tables into the backup database
        sqlDB.copyTo(backup)
    backup.close()
    return errors
def checkListOfRecords(self,listOfRecords,entityName,primaryKey=None,executeMany=True,fixNone=False,fixDates=False,debug=False,doClose=True):
    '''
    check the handling of the given list of records by storing them and reading them back

    Args:
        listOfRecords(list): a list of dicts that contain the data to be stored
        entityName(string): the name of the entity type to be used as a table name
        primaryKey(string): the name of the key / column to be used as a primary key
        executeMany(boolean): True if executeMany mode of sqlite3 should be used
        fixNone(boolean): fix dict entries that are undefined to have a "None" entry
        fixDates(boolean): True if date columns should be converted back when querying
        debug(boolean): True if debug information e.g. CREATE TABLE and INSERT INTO commands should be shown
        doClose(boolean): True if the connection should be closed

    Returns:
        list: the records as read back from the database
    '''
    recordCount=len(listOfRecords)
    if self.debug:
        print("%s size is %d fixNone is %r fixDates is: %r" % (entityName,recordCount,fixNone,fixDates))
    # create a fresh database and derive the schema from a sample of the records
    self.sqlDB=SQLDB(debug=debug,errorDebug=True)
    entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityName,primaryKey)
    # measure how long the insert takes
    insertStart=time.time()
    self.sqlDB.store(listOfRecords,entityInfo,executeMany=executeMany,fixNone=fixNone)
    duration=time.time()-insertStart
    if self.debug:
        print ("adding %d %s records took %5.3f s => %5.f records/s" % (recordCount,entityName,duration,recordCount/duration))
    # read all records back
    resultList=self.sqlDB.queryAll(entityInfo,fixDates=fixDates)
    if self.debug:
        print ("selecting %d %s records took %5.3f s => %5.f records/s" % (len(resultList),entityName,duration,len(resultList)/duration))
    if doClose:
        self.sqlDB.close()
    return resultList
def copyFrom(self, dbFile):
    '''
    copy the contents of the given database file to my database

    Args:
        dbFile(string): the database to copy the content from
    '''
    targetDB = self.getSQLDB()
    # open the source database and transfer all of its tables to mine
    SQLDB(dbFile).copyTo(targetDB)
def getDocumentRecords(self,archiveName,folderPath):
    '''
    get the document records for the given archive and folder

    Args:
        archiveName(str): the name of the archive
        folderPath(str): the path of the folder

    Returns:
        list: the matching document records as a list of dicts
    '''
    db=SQLDB(self.getCacheFile())
    sqlQuery="SELECT * FROM document WHERE archiveName=(?) AND folderPath=(?)"
    # parameterized query avoids SQL injection
    return db.query(sqlQuery,(archiveName,folderPath,))
def testGeoTextAndGrapy(self):
    '''
    test the GeoText and geograpy3 library
    '''
    debug = True
    limit = 100
    # the most frequently used localities first
    # NOTE(review): SQL line breaks reconstructed from collapsed source - whitespace is insignificant for SQL
    sqlQuery = """select count(*) as count, locality
from Event_wikicfp
where locality is not null
group by locality
order by 1 desc LIMIT %d """ % limit
    dbFile = Lookup.getDBFile()
    # reuse an existing database file or build the lookup from scratch
    if os.path.isfile(dbFile):
        sqlDB = SQLDB(dbFile)
    else:
        lookup = Lookup.ensureAllIsAvailable("testGeoText")
        sqlDB = lookup.getSQLDB()
    if sqlDB is not None:
        print("testGeoText from database %s " % sqlDB.dbname)
        totalResult = sqlDB.query("""select count(*) as count from event_wikicfp where locality is not null""")
        total = totalResult[0]['count']
        listOfDicts = sqlDB.query(sqlQuery)
        index = 0
        # rsum: running sum of locality counts (coverage of the top localities)
        rsum = 0
        # found: localities geograpy could resolve to a city
        found = 0
        # problems: localities that could not be resolved
        problems = []
        for record in listOfDicts:
            locality = record['locality']
            count = record['count']
            index += 1
            rsum += count
            print("%5d: %5d/%5d %5.1f%%=%s" % (index, count, rsum, rsum / total * 100, locality))
            geo = GeoText(locality)
            if debug:
                print(" %s" % geo.countries)
                print(" %s" % geo.cities)
            city = geograpy.locateCity(locality)
            if city is not None:
                found += 1
            else:
                problems.append(locality)
            if debug:
                print(" %s%s" % (city, '✅' if city is not None else '❌'))
        if self.debug:
            print("found %d/%d = %5.1f%%" % (found, limit, found / limit * 100))
            print("problems: %s" % problems)
        # expect at least 80% of the top localities to be resolvable
        self.assertTrue(found / limit > 0.8)
    pass
def testWikidataCities(self): ''' test getting city information from wikidata ''' # Wikidata time outs in CI environment need to be avoided if getpass.getuser() != "wf": return config = StorageConfig.getSQL(debug=self.debug) config.cacheRootDir = "/tmp/wdhs" cachedir = config.getCachePath() config.cacheFile = f"{cachedir}/hs.db" # use 2018 wikidata copy # wikidata.endpoint="http://blazegraph.bitplan.com/sparql" # use 2020 wikidata copy wikidata = Wikidata() wikidata.endpoint = "https://confident.dbis.rwth-aachen.de/jena/wdhs/sparql" #wikidata.endpoint="http://jena.bitplan.com/wdhs/sparql" regions = [{ "name": "Singapore", "country": "Q334", "region": None, "cities": 46 }, { "name": "Beijing", "country": None, "region": "Q956", "cities": 25 }, { "name": "Paris", "country": None, "region": "Q13917", "cities": 1242 }, { "name": "Barcelona", "country": None, "region": "Q5705", "cities": 1242 }, { "name": "Rome", "country": None, "region": "Q1282", "cities": 1242 }] limit = 1000000 #if self.inCI() else 100 cityList = wikidata.getCities(limit=limit) sqlDB = SQLDB(config.cacheFile) entityInfo = sqlDB.createTable(cityList, "hs", withDrop=True) sqlDB.store(cityList, entityInfo, fixNone=True) expected = 200000 # if self.inCI() else limit self.assertTrue(len(cityList) >= expected)
def populate_Cities_FromWikidata(self,sqlDB):
    '''
    fill the target database with the prepared Wikidata city database,
    downloading it first if it is not cached locally

    Args:
        sqlDB(SQLDB): target SQL database
    '''
    dbFile=self.db_path+"/City_wikidata.db"
    if not os.path.exists(dbFile):
        # fetch the prepared city database once and cache it locally
        print("Downloading %s ... this might take a few seconds" % dbFile)
        dbUrl="http://wiki.bitplan.com/images/confident/City_wikidata.db"
        urllib.request.urlretrieve(dbUrl,dbFile)
    sourceDB=SQLDB(dbFile)
    sourceDB.copyTo(sqlDB)
def testIssue41(self):
    '''
    https://github.com/WolfgangFahl/pyLoDStorage/issues/41
    improve error message when create table command fails
    '''
    # the 'py/object' column name is invalid SQL and must trigger an error
    records=[{
        'name':'value',
        'py/object': 'datetime.time'
    }]
    self.sqlDB=SQLDB(debug=self.debug,errorDebug=True)
    raised=False
    try:
        _entityInfo=self.sqlDB.createTable(records[:1],'Invalid','name')
    except Exception as ex:
        raised=True
        # the error message must mention the failing CREATE TABLE command
        self.assertTrue("CREATE TABLE Invalid" in str(ex))
    if not raised:
        self.fail("There should be an exception")
def getSqlDB():
    '''
    get the SQLite database connection

    Returns:
        SQLDB: the SQLite database access for the configured cacheFile
    '''
    config=DMSStorage.getStorageConfig(mode="sql")
    # allow sharing the connection across threads
    # https://stackoverflow.com/a/48234567/1497139
    return SQLDB(config.cacheFile,check_same_thread=False)
def __init__(self, db_file=None,correctMisspelling=False,debug=False):
    '''
    Constructor

    Args:
        db_file(str): the path to the database file
        correctMisspelling(bool): if True correct typical misspellings
        debug(bool): if True show debug information
    '''
    self.debug=debug
    self.correctMisspelling=correctMisspelling
    # the database lives next to this module by default
    self.db_path=os.path.dirname(os.path.realpath(__file__))
    self.db_file = db_file or self.db_path+"/locs.db"
    # name of the SQL view used for city lookups
    self.view="GeoLite2CityLookup"
    self.sqlDB=SQLDB(self.db_file,errorDebug=True)
    # load the alias lookup table
    self.getAliases()
    # expected version stamp of the prepopulated database
    self.dbVersion="2020-09-27 16:48:09"
def getWordUsageDB(self):
    '''
    get the word usage database

    Returns:
        SQLDB: the word usage database or None if its database file does not exist
    '''
    dbPath = Lookup.getDBFile("wordusage")
    # only open the database if its file is actually there
    if not os.path.isfile(dbPath):
        return None
    return SQLDB(dbPath)
def testCacheLocationLabels(self):
    '''
    Generates the location label tables in the SQL db for countries, regions and cities
    by querying wikidata for the rdfs:label and skos:altLabel of each location.
    A view containing all location labels is also created.
    '''
    # long-running generator - only enabled on demand
    testLocationLabelExtraction = False
    if testLocationLabelExtraction:
        wd = Wikidata()
        config = LocationContext.getDefaultConfig()
        countryManager = CountryManager(config=config)
        regionManager = RegionManager(config=config)
        cityManager = CityManager(config=config)
        sqlDb = SQLDB(dbname=config.cacheFile, debug=self.debug)
        for manager in countryManager, regionManager, cityManager:
            manager.fromCache()
            # collect the wikidata ids of all entities of this manager
            wikidataIdQuery = f"SELECT DISTINCT wikidataid FROM {manager.entityPluralName}"
            wikidataIdQueryRes = sqlDb.query(wikidataIdQuery)
            wikidataIds = [l['wikidataid'] for l in wikidataIdQueryRes]
            # query wikidata in chunks to keep the SPARQL queries a manageable size
            chunkSize = 1000
            iterations = math.ceil(len(wikidataIds) / chunkSize)
            progress = 0
            res = []
            for i in range(iterations):
                workOnIds = wikidataIds[i * chunkSize:(i + 1) * chunkSize]
                progress += len(workOnIds)
                index = 0
                values = ""
                for location in workOnIds:
                    # insert a line break after every 10th value to keep the query readable
                    spacer = " \n\t\t\t" if index % 10 == 0 else " "
                    values += f"{spacer}wd:{wd.getWikidataId(location)}"
                    index += 1
                query = self.getLablesQuery(values)
                res.extend(
                    wd.query(
                        f"Query {i}/{iterations} - Querying {manager.entityName} Labels",
                        queryString=query))
            # persist the collected labels per entity type
            wd.store2DB(res, tableName=f"{manager.entityName}_labels", sqlDB=sqlDb)
        self.createViews(sqlDB=sqlDb)
def testIssue14_execute(self):
    '''
    https://github.com/WolfgangFahl/pyLoDStorage/issues/14
    offer execute wrapper directly via sqlDB
    '''
    sqlDB=SQLDB()
    # create a table directly via the execute wrapper
    ddl="""
    CREATE TABLE contacts (
        contact_id INTEGER PRIMARY KEY,
        first_name TEXT NOT NULL,
        last_name TEXT NOT NULL
    )
    """
    sqlDB.execute(ddl)
    tables=sqlDB.getTableList()
    if self.debug:
        print(tables)
    # exactly the one table we created must exist
    self.assertEqual(1,len(tables))
    self.assertEqual("contacts",tables[0]['name'])
def testPlantUml(self):
    '''
    get plant UML functionality
    '''
    schemaManager = None
    # SMW schema lookup is only done outside of the CI environment
    if getpass.getuser() != "travis":
        o = Ontology()
        schemaManager = o.getRQSchema(
            fromCache=False)  # to force SMW query
    lookup = Lookup("plantuml", getAll=False, butNot='or')
    dbfile = lookup.getDBFile('Event_all')
    sqlDB = SQLDB(dbfile)
    tableList = sqlDB.getTableList()
    eventTableList = []
    eventSchemas = lookup.getEventSchemas()
    # collect the Event_* tables together with their schema and record count
    for table in tableList:
        tableName = table['name']
        if tableName.startswith("Event_"):
            table['schema'] = eventSchemas[tableName]
            eventTableList.append(table)
            countQuery = "SELECT count(*) as count from %s" % tableName
            countResult = sqlDB.query(countQuery)
            table['instances'] = countResult[0]['count']
    self.assertEqual(8, len(eventTableList))
    uml = UML()
    now = datetime.now()
    nowYMD = now.strftime("%Y-%m-%d")
    # NOTE(review): line breaks of the title reconstructed from collapsed source
    title = """ConfIDent  Entities
%s
[[https://projects.tib.eu/en/confident/ © 2019-2020 ConfIDent project]]
see also [[http://ptp.bitplan.com/settings Proceedings Title Parser]]
""" % nowYMD
    plantUml = uml.mergeSchema(schemaManager,
                               eventTableList,
                               title=title,
                               packageName='DataDonations',
                               generalizeTo="Event")
    print(plantUml)
    # the generalization to a common Event base class must be present
    self.assertTrue("Event <|-- Event_confref" in plantUml)
    self.assertTrue("class Event " in plantUml)
def getSQLDB(self, cacheFile):
    '''
    open the SQL database stored in the given cacheFile

    Args:
        cacheFile(string): the file to get the SQL db from

    Returns:
        SQLDB: the SQL database access
    '''
    cfg = self.config
    # remember the database handle on self as a side effect
    self.sqldb = SQLDB(cacheFile,
                       debug=cfg.debug,
                       errorDebug=cfg.errorDebug)
    return self.sqldb
def testUniqueConstraint(self):
    '''
    test for https://github.com/WolfgangFahl/pyLoDStorage/issues/4
    sqlite3.IntegrityError: UNIQUE constraint failed: ... show debug info
    '''
    # 'name' is the primary key, so the duplicate "John Doe" must fail
    listOfDicts=[
        {"name": "John Doe"},
        {"name": "Frank Doe"},
        {"name": "John Doe"},
        {"name":"Tim Doe"}]
    sqlDB=SQLDB(debug=self.debug,errorDebug=True)
    entityInfo=sqlDB.createTable(listOfDicts[:10],'Does','name')
    try:
        sqlDB.store(listOfDicts,entityInfo,executeMany=False)
        self.fail("There should be an exception")
    except Exception as ex:
        # NOTE(review): line breaks of the expected message reconstructed from
        # collapsed source - confirm against SQLDB.store's error formatting
        expected="""INSERT INTO Does (name) values (:name)
failed:UNIQUE constraint failed: Does.name
record #3={'name': 'John Doe'}"""
        errMsg=str(ex)
        self.assertEqual(expected,errMsg)
def testCopy(self):
    '''
    test copying databases into another database
    '''
    dbFile="/tmp/DAWT_Sample3x1000.db"
    copyDB=SQLDB(dbFile)
    # create three sample tables and copy each into the target database
    for sampleNo in range(3):
        records=Sample.getSample(1000)
        self.checkListOfRecords(records, 'Sample_%d_1000' %sampleNo, 'pKey',doClose=False)
        self.sqlDB.copyTo(copyDB)
    size=os.stat(dbFile).st_size
    if self.debug:
        print ("size of copy DB is %d" % size)
    self.assertTrue(size>70000)
    tableList=copyDB.getTableList()
    if self.debug:
        print(tableList)
    # all three sample tables must have arrived in order
    for sampleNo in range(3):
        expectedName='Sample_%d_1000' %sampleNo
        self.assertEqual(expectedName,tableList[sampleNo]['name'])
    # check that database is writable
    # https://stackoverflow.com/a/44707371/1497139
    copyDB.execute("pragma user_version=0")
def getFolder(self,archive,folderPath:str):
    '''
    get the folder for the given archive and folderPath

    Args:
        archive: the archive
        folderPath: the path of the folder

    Returns:
        Folder: the matching folder or None if there is no matching record

    Raises:
        Exception: if more than one folder record matches
    '''
    sqlDB=SQLDB(self.getCacheFile())
    sqlQuery="SELECT * FROM folder WHERE archiveName=(?) AND path=(?)"
    archiveName=archive.name
    records=sqlDB.query(sqlQuery,(archiveName,folderPath,))
    # the (archiveName,path) combination must be unique
    if len(records)>1:
        msg=f"{len(records)} folders found for {archiveName}:{folderPath} - there should be only one"
        raise Exception(msg)
    if not records:
        return None
    folder=Folder()
    folder.fromDict(records[0])
    folder.archive=archive
    return folder
def getSQLDB(self, cacheFileName='Event_all'):
    '''
    get the SQLDB

    Args:
        cacheFileName(string): prefix of database file name

    Returns:
        SQLDB: the SQLDB access
    '''
    # resolve the prefix to the full database file path
    return SQLDB(Lookup.getDBFile(cacheFileName))
def testIssue15(self):
    '''
    https://github.com/WolfgangFahl/pyLoDStorage/issues/15
    auto create view ddl in mergeschema
    '''
    self.sqlDB=SQLDB(debug=self.debug,errorDebug=self.debug)
    listOfRecords=Sample.getRoyals()
    entityInfo=EntityInfo(listOfRecords[:3],'Person','name',debug=self.debug)
    entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityInfo.name,entityInfo.primaryKey)
    # a second table sharing only the name and lastmodified columns
    listOfRecords=[{'name': 'Royal family', 'country': 'UK', 'lastmodified':datetime.now()}]
    entityInfo=self.sqlDB.createTable(listOfRecords[:10],'Family','name')
    tableList=self.sqlDB.getTableList()
    # the generated view should select only the columns common to both tables
    viewDDL=Schema.getGeneralViewDDL(tableList,"PersonBase")
    if self.debug:
        print (viewDDL)
    # NOTE(review): line breaks of the expected DDL reconstructed from
    # collapsed source - confirm against Schema.getGeneralViewDDL
    expected="""CREATE VIEW PersonBase AS
SELECT name,lastmodified FROM Person
UNION
SELECT name,lastmodified FROM Family"""
    self.assertEqual(expected,viewDDL)
    pass
def testWordParser(self):
    '''
    try finding quantiles
    see https://stackoverflow.com/questions/2374640/how-do-i-calculate-percentiles-with-python-numpy
    '''
    lookup = Lookup("test Word parser")
    sqlDB = lookup.getSQLDB()
    if sqlDB is not None:
        totalWordUsages = []
        # analyze the proceedings titles of each event source
        for source in ['wikidata', 'crossref', 'dblp', 'CEUR-WS']:
            listOfDicts = TestWordParser.getProceedingsTitles(
                sqlDB, source)
            cwp = CorpusWordParser()
            wordusages = cwp.parse(listOfDicts)
            # lens: number of words per event id
            lens = {}
            for wordusage in wordusages:
                totalWordUsages.append(wordusage.__dict__)
                if wordusage.eventId in lens:
                    lens[wordusage.eventId] += 1
                else:
                    lens[wordusage.eventId] = 1
            df = DataFrame(lens.values())
            print(df.quantile(1))
            # 90% quantile of the word counts
            quantileValues = df.quantile(.90)
            print(quantileValues)
            plot = Plot(lens.values(),
                        "%s wordcount histogram" % source,
                        xlabel="wordcount",
                        ylabel="frequency")
            plot.hist(mode='save')
        # persist all collected word usages to the word usage database
        wordUsageDBFile = Lookup.getDBFile("wordusage")
        wSQLDB = SQLDB(wordUsageDBFile)
        entityInfo = wSQLDB.createTable(totalWordUsages, "wordusage", withDrop=True)
        wSQLDB.store(totalWordUsages, entityInfo)
def getSampleTableDB(withDrop=False,debug=False,failIfTooFew=False,sampleSize=1000):
    '''
    get an in-memory sample database with a "sample" table

    Args:
        withDrop(bool): True if an existing sample table should be dropped first
        debug(bool): True if debug information should be shown
        failIfTooFew(bool): True if table creation should fail when fewer records than expected are sampled
        sampleSize(int): the number of sample records to create

    Returns:
        SQLDB: the database populated with the sample records
    '''
    records=Sample.getSample(sampleSize)
    sqlDB=SQLDB()
    sqlDB.debug=debug
    # derive the schema from the records; claim 10x the records for sampling
    entityInfo=sqlDB.createTable(records,
        "sample",
        primaryKey='pKey',
        withDrop=withDrop,
        sampleRecordCount=sampleSize*10,
        failIfTooFew=failIfTooFew)
    sqlDB.store(records,entityInfo,executeMany=True,fixNone=True)
    return sqlDB
def setSingleDBConfig(self, config):
    '''
    set the cacheFile to the singleDB "Event_all.db"

    Args:
        config(StorageConfig): the storage configuration to use
    '''
    # if event_all.db is available use it ...
    config.cacheFile = Lookup.getDBFile()
    if not os.path.isfile(config.cacheFile):
        # the combined database is not available at all
        config.errors = ['cachefile %s is missing' % config.cacheFile]
        config.cacheFile = None
        return
    # make sure the event_all db is complete
    config.errors = self.check(SQLDB(config.cacheFile), debug=self.debug)
    if config.errors:
        config.cacheFile = None
    else:
        config.singleDB = True
def testBackup(self):
    '''
    test creating a backup of the SQL database
    '''
    # the sqlite3 Connection.backup API needs Python 3.7+
    if sys.version_info >= (3, 7):
        listOfRecords=Sample.getCities()
        # keep the connection open so self.sqlDB can be backed up afterwards
        self.checkListOfRecords(listOfRecords,'City',fixDates=True,doClose=False)
        backupDB="/tmp/testSqlite.db"
        showProgress=200 if self.debug else 0
        self.sqlDB.backup(backupDB,profile=self.debug,showProgress=showProgress)
        size=os.stat(backupDB).st_size
        if self.debug:
            print ("size of backup DB is %d" % size)
        self.assertTrue(size>600000)
        self.sqlDB.close()
        # restore the backup into an in-memory database and compare record counts
        ramDB=SQLDB.restore(backupDB, SQLDB.RAM, profile=self.debug,showProgress=showProgress)
        entityInfo=EntityInfo(listOfRecords[:50],'City',debug=self.debug)
        allCities=ramDB.queryAll(entityInfo)
        self.assertEqual(len(allCities),len(listOfRecords))
def testQueryParams(self):
    '''
    test Query Params
    '''
    cityRecords=[
        {"city": "New York", "country": "US"},
        {"city": "Amsterdam", "country": "NL"},
        {"city": "Paris", "country": "FR"}]
    sqlDB=SQLDB(debug=self.debug,errorDebug=True)
    entityInfo=sqlDB.createTable(cityRecords[:10],'cities','city')
    sqlDB.store(cityRecords,entityInfo,executeMany=False)
    # select by country via a bound query parameter
    frCities=sqlDB.query("SELECT * from cities WHERE country in (?)",('FR',))
    if self.debug:
        print (frCities)
    self.assertEqual([{'city': 'Paris', 'country': 'FR'}],frCities)
class Locator(object):
    '''
    location handling
    '''
    # singleton instance
    locator=None

    def __init__(self, db_file=None,correctMisspelling=False,debug=False):
        '''
        Constructor

        Args:
            db_file(str): the path to the database file
            correctMisspelling(bool): if True correct typical misspellings
            debug(bool): if True show debug information
        '''
        self.debug=debug
        self.correctMisspelling=correctMisspelling
        # the database lives next to this module by default
        self.db_path=os.path.dirname(os.path.realpath(__file__))
        self.db_file = db_file or self.db_path+"/locs.db"
        # name of the SQL view used for city lookups
        self.view="GeoLite2CityLookup"
        self.sqlDB=SQLDB(self.db_file,errorDebug=True)
        # load the alias lookup table
        self.getAliases()
        # expected version stamp of the prepopulated database
        self.dbVersion="2020-09-27 16:48:09"

    @staticmethod
    def resetInstance():
        # drop the singleton so the next getInstance() builds a fresh Locator
        Locator.locator=None

    @staticmethod
    def getInstance(correctMisspelling=False,debug=False):
        '''
        get the singleton instance of the Locator. If parameters are changed on further calls
        the initial parameters will still be in effect since the original instance
        will be returned!

        Args:
            correctMisspelling(bool): if True correct typical misspellings
            debug(bool): if True show debug information
        '''
        if Locator.locator is None:
            Locator.locator=Locator(correctMisspelling=correctMisspelling,debug=debug)
        return Locator.locator

    def locateCity(self,places):
        '''
        locate a city, region country combination based on the given wordtoken information

        Args:
            places(list): a list of places derived by splitting a locality e.g.
            "San Francisco, CA" leads to "San Francisco", "CA"

        Returns:
            City: a city with country and region details
        '''
        # make sure the database is populated
        self.populate_db()
        country=None
        cities=[]
        regions=[]
        # loop over all word elements
        for place in places:
            place=place.strip()
            # apply the alias table first
            if place in self.aliases:
                place=self.aliases[place]
            foundCountry=self.getCountry(place)
            if foundCountry is not None:
                country=foundCountry
            foundCities=self.cities_for_name(place)
            cities.extend(foundCities)
            foundRegions=self.regions_for_name(place)
            regions.extend(foundRegions)
        foundCity=self.disambiguate(country, regions, cities)
        return foundCity

    def isISO(self,s):
        '''
        check if the given string is an ISO code

        Returns:
            bool: True if the string is an ISO Code
        '''
        # matches e.g. "US", "NRW" and region codes like "US-CA"
        m=re.search(r"^([A-Z]{1,2}\-)?[0-9A-Z]{1,3}$",s)
        result=m is not None
        return result

    def disambiguate(self,country,regions,cities,byPopulation=True):
        '''
        try determining country, regions and city from the potential choices

        Args:
            country(Country): a matching country found
            regions(list): a list of matching Regions found
            cities(list): a list of matching cities found
            byPopulation(bool): if True fall back to the most populous candidate city

        Return:
            City: the found city or None
        '''
        if self.debug:
            print("countries: %s " % country)
            print("regions: %s" % "\n\t".join(str(r) for r in regions))
            print("cities: %s" % "\n\t".join(str(c) for c in cities))
        foundCity=None
        # is the city information unique?
        if len(cities)==1:
            foundCity=cities[0]
        else:
            if len(cities)>1:
                # prefer a candidate city whose country matches the found country
                if country is not None:
                    for city in cities:
                        if self.debug:
                            print("city %s: " %(city))
                        if city.country.iso==country.iso:
                            foundCity=city
                            break
                # next try matching by region, skipping cities named after their region
                if foundCity is None and len(regions)>0:
                    for region in regions:
                        for city in cities:
                            if city.region.iso==region.iso and not city.region.name==city.name:
                                foundCity=city
                                break;
                        if foundCity is not None:
                            break
                # last resort: pick the most populous candidate
                # NOTE(review): nesting reconstructed from collapsed source - assumed
                # inside the len(cities)>1 branch (max() would fail on an empty list)
                if foundCity is None and byPopulation:
                    foundCity=max(cities,key=lambda city:0 if city.population is None else city.population)
                pass
        return foundCity

    def cities_for_name(self, cityName):
        '''
        find cities with the given cityName

        Args:
            cityName(string): the potential name of a city

        Returns:
            a list of city records
        '''
        cities=[]
        # look up by both the GeoLite2 name and the wikidata name
        for column in ['name','wikidataName']:
            cityRecords=self.places_by_name(cityName, column)
            for cityRecord in cityRecords:
                cities.append(City.fromGeoLite2(cityRecord))
        return cities

    def regions_for_name(self, region_name):
        '''
        get the regions for the given region_name (which might be an ISO code)

        Args:
            region_name(string): region name

        Returns:
            list: the list of regions for this region name
        '''
        regions=[]
        # ISO codes are matched against the code column, plain names against the label
        if self.isISO(region_name):
            columnName="regionIsoCode"
        else:
            columnName='regionLabel'
        query="SELECT * from regions WHERE %s = (?)" % (columnName)
        params=(region_name,)
        regionRecords=self.sqlDB.query(query,params)
        for regionRecord in regionRecords:
            regions.append(Region.fromWikidata(regionRecord))
        return regions

    def correct_country_misspelling(self, name):
        '''
        correct potential misspellings

        Args:
            name(string): the name of the country potentially misspelled

        Return:
            string: corrected name or unchanged
        '''
        cur_dir = os.path.dirname(os.path.realpath(__file__))
        # the CSV maps common misspellings (column 0) to the proper name (column 2)
        with open(cur_dir + "/data/ISO3166ErrorDictionary.csv") as info:
            reader = csv.reader(info)
            for row in reader:
                if name in remove_non_ascii(row[0]):
                    return row[2]
        return name

    def is_a_country(self, name):
        '''
        check if the given string name is a country

        Args:
            name(string): the string to check

        Returns:
            True: if pycountry thinks the string is a country
        '''
        country=self.getCountry(name)
        result=country is not None
        return result

    def getCountry(self,name):
        '''
        get the country for the given name

        Args:
            name(string): the name of the country to lookup

        Returns:
            country: the country if one was found or None if not
        '''
        if self.isISO(name):
            # try an ISO 3166-1 alpha-2 lookup
            pcountry=pycountry.countries.get(alpha_2=name)
        else:
            if self.correctMisspelling:
                name = self.correct_country_misspelling(name)
            pcountry=pycountry.countries.get(name=name)
        country=None
        if pcountry is not None:
            country=Country.fromPyCountry(pcountry)
        #if country is None:
        #    query="SELECT * FROM countries WHERE countryLabel = (?)"""
        #    params=(name,)
        #    countryRecords=self.sqlDB.query(query,params)
        #    if len(countryRecords)>0:
        #        pass
        return country

    def getView(self):
        '''
        get the view to be used

        Returns:
            str: the SQL view to be used for CityLookups e.g. GeoLite2CityLookup
        '''
        view=self.view
        return view

    def places_by_name(self, placeName, columnName):
        '''
        get places by name and column

        Args:
            placeName(string): the name of the place
            columnName(string): the column to look at
        '''
        # lazily populate the database on first use
        if not self.db_has_data():
            self.populate_db()
        view=self.getView()
        query='SELECT * FROM %s WHERE %s = (?)' % (view,columnName)
        params=(placeName,)
        cities=self.sqlDB.query(query,params)
        return cities

    def getGeolite2Cities(self):
        '''
        get the Geolite2 City-Locations as a list of Dicts

        Returns:
            list: a list of Geolite2 City-Locator dicts
        '''
        cities=self.readCSV("GeoLite2-City-Locations-en.csv")
        return cities

    def readCSV(self,fileName):
        '''
        read the given CSV file from my data directory

        Args:
            fileName(string): the name of the file below data/

        Returns:
            list: the rows as a list of dicts
        '''
        records=[]
        cur_dir = os.path.dirname(os.path.realpath(__file__))
        csvfile="%s/data/%s" % (cur_dir,fileName)
        with open(csvfile) as info:
            reader = csv.DictReader(info)
            for row in reader:
                records.append(row)
        return records

    def recreateDatabase(self):
        '''
        recreate my lookup database
        '''
        print("recreating database ... %s" % self.db_file)
        self.populate_db(force=True)

    def populate_db(self,force=False):
        '''
        populate the cities SQL database which caches the information from the
        GeoLite2-City-Locations.csv file

        Args:
            force(bool): if True force a recreation of the database
        '''
        hasData=self.db_has_data()
        if force:
            # rebuild everything from the raw sources
            self.populate_Cities(self.sqlDB)
            self.populateFromWikidata(self.sqlDB)
            self.getWikidataCityPopulation(self.sqlDB)
            self.createViews(self.sqlDB)
            self.populate_Version(self.sqlDB)
        elif not hasData:
            # download a prebuilt gzipped database instead of rebuilding from scratch
            url="http://wiki.bitplan.com/images/confident/locs.db.gz"
            zipped=self.db_file+".gz"
            print("Downloading %s from %s ... this might take a few seconds" % (zipped,url))
            urllib.request.urlretrieve(url,zipped)
            print("unzipping %s from %s" % (self.db_file,zipped))
            with gzip.open(zipped, 'rb') as gzipped:
                with open(self.db_file, 'wb') as unzipped:
                    shutil.copyfileobj(gzipped, unzipped)
        # NOTE(review): raising a plain string is a TypeError in Python 3 - this
        # should be e.g. raise Exception(...); placement of this check also
        # reconstructed from collapsed source - confirm it belongs at method level
        if not os.path.isfile(self.db_file):
            raise("could not create lookup database %s" % self.db_file)

    def populate_Version(self,sqlDB):
        '''
        populate the version table

        Args:
            sqlDB(SQLDB): target SQL database
        '''
        versionList=[{"version":self.dbVersion}]
        entityInfo=sqlDB.createTable(versionList,"Version","version",withDrop=True)
        sqlDB.store(versionList,entityInfo)

    def getAliases(self):
        '''
        get the aliases hashTable
        '''
        aliases=self.readCSV("aliases.csv")
        self.aliases={}
        for alias in aliases:
            self.aliases[alias['name']]=alias['alias']

    def populateFromWikidata(self,sqlDB):
        '''
        populate countries and regions from Wikidata

        Args:
            sqlDB(SQLDB): target SQL database
        '''
        self.populate_Countries(sqlDB)
        self.populate_Regions(sqlDB)
        return
        # ignore the following code as of 2020-09-26
        # NOTE(review): everything below the return is unreachable dead code
        self.populate_Cities_FromWikidata(sqlDB)
        viewDDLs=["DROP VIEW IF EXISTS WikidataCityLookup","""
CREATE VIEW WikidataCityLookup AS
SELECT
  name AS name,
  regionLabel as regionName,
  regionIsoCode as regionIsoCode,
  countryLabel as countryName,
  countryIsoCode as countryIsoCode,
  cityPopulation as population,
  countryGDP_perCapita as gdp
FROM City_wikidata
"""]
        # subdivision_1_name AS regionName,
        # subdivision_1_iso_code as regionIsoCode,
        # country_name AS countryName,
        # country_iso_code as countryIsoCode
        for viewDDL in viewDDLs:
            self.sqlDB.execute(viewDDL)

    def populate_Countries(self,sqlDB):
        '''
        populate database with countries from wikiData

        Args:
            sqlDB(SQLDB): target SQL database
        '''
        print("retrieving Country data from wikidata ... (this might take a few seconds)")
        wikidata=Wikidata()
        wikidata.getCountries()
        entityInfo=sqlDB.createTable(wikidata.countryList,"countries",None,withDrop=True,sampleRecordCount=200)
        sqlDB.store(wikidata.countryList,entityInfo,fixNone=True)

    def populate_Regions(self,sqlDB):
        '''
        populate database with regions from wikiData

        Args:
            sqlDB(SQLDB): target SQL database
        '''
        print("retrieving Region data from wikidata ... (this might take a minute)")
        wikidata=Wikidata()
        wikidata.getRegions()
        # only the first 5000 records are sampled for schema derivation
        entityInfo=sqlDB.createTable(wikidata.regionList[:5000],"regions",primaryKey=None,withDrop=True)
        sqlDB.store(wikidata.regionList,entityInfo,fixNone=True)

    def populate_Cities_FromWikidata(self,sqlDB):
        '''
        populate the given sqlDB with the Wikidata Cities

        Args:
            sqlDB(SQLDB): target SQL database
        '''
        dbFile=self.db_path+"/City_wikidata.db"
        if not os.path.exists(dbFile):
            # fetch the prepared city database once and cache it locally
            print("Downloading %s ... this might take a few seconds" % dbFile)
            dbUrl="http://wiki.bitplan.com/images/confident/City_wikidata.db"
            urllib.request.urlretrieve(dbUrl,dbFile)
        wikiCitiesDB=SQLDB(dbFile)
        wikiCitiesDB.copyTo(sqlDB)

    def getWikidataCityPopulation(self,sqlDB,endpoint=None):
        '''
        get the wikidata city populations and join them with the GeoLite2 cities

        Args:
            sqlDB(SQLDB): target SQL database
            endpoint(str): url of the wikidata endpoint or None if default should be used
        '''
        dbFile=self.db_path+"/city_wikidata_population.db"
        rawTableName="cityPops"
        # is the wikidata population database available?
        if not os.path.exists(dbFile):
            # shall we create it from a wikidata query?
            if endpoint is not None:
                wikidata=Wikidata()
                wikidata.endpoint=endpoint
                cityList=wikidata.getCityPopulations()
                wikiCitiesDB=SQLDB(dbFile)
                entityInfo=wikiCitiesDB.createTable(cityList[:300],rawTableName,primaryKey=None,withDrop=True)
                wikiCitiesDB.store(cityList,entityInfo,fixNone=True)
            else:
                # just download a copy
                print("Downloading %s ... this might take a few seconds" % dbFile)
                dbUrl="http://wiki.bitplan.com/images/confident/city_wikidata_population.db"
                urllib.request.urlretrieve(dbUrl,dbFile)
        # (re) open the database
        wikiCitiesDB=SQLDB(dbFile)
        # check whether the table is populated
        tableList=sqlDB.getTableList()
        tableName="citiesWithPopulation"
        if self.db_recordCount(tableList, tableName)<10000:
            # check that database is writable
            # https://stackoverflow.com/a/44707371/1497139
            sqlDB.execute("pragma user_version=0")
            # makes sure both tables are in target sqlDB
            wikiCitiesDB.copyTo(sqlDB)
            # create joined table
            # NOTE(review): SQL line breaks reconstructed from collapsed source
            sqlQuery="""
            select geoname_id, city_name, cp.cityLabel,
            country_iso_code, country_name,
            subdivision_1_iso_code, subdivision_1_name,
            cp.city as wikidataurl, cp.cityPop
            from cities c join cityPops cp
            on c.geoname_id=cp.geoNameId
            union
            select geoNameId as geoname_id, null as city_name, cityLabel,
            countryIsoCode as country_iso_code, countryLabel as country_name,
            null as subdivision_1_iso_code, null as subdivision_1_name,
            city as wikidataurl, cityPop
            from cityPops
            where cityPop is not Null
            group by geoNameId
            order by cityPop desc
            """
            cityList=sqlDB.query(sqlQuery)
            entityInfo=sqlDB.createTable(cityList,tableName,primaryKey=None,withDrop=True,sampleRecordCount=500)
            sqlDB.store(cityList,entityInfo,fixNone=True)
        # remove raw Table
        #sqlCmd="DROP TABLE %s " %rawTableName
        #sqlDB.execute(sqlCmd)

    def populate_Cities(self,sqlDB):
        '''
        populate the given sqlDB with the Geolite2 Cities

        Args:
            sqlDB(SQLDB): the SQL database to use
        '''
        cities=self.getGeolite2Cities()
        entityName="cities"
        primaryKey="geoname_id"
        entityInfo=sqlDB.createTable(cities[:100],entityName,primaryKey,withDrop=True)
        sqlDB.store(cities,entityInfo,executeMany=False)

    def createViews(self,sqlDB):
        # create the GeoLite2CityLookup view joining city, region and country data
        viewDDLs=["DROP VIEW IF EXISTS GeoLite2CityLookup","""
CREATE VIEW GeoLite2CityLookup AS
SELECT
  city_name AS name,
  cityLabel AS wikidataName,
  wikidataurl,
  cityPop,
  subdivision_1_name AS regionName,
  subdivision_1_iso_code as regionIsoCode,
  country_name AS countryName,
  country_iso_code as countryIsoCode
FROM citiesWithPopulation
"""]
        for viewDDL in viewDDLs:
            sqlDB.execute(viewDDL)

    def db_recordCount(self,tableList,tableName):
        '''
        count the number of records for the given tableName

        Args:
            tableList(list): the list of table to check
            tableName(str): the name of the table to check

        Returns
            int: the number of records found for the table
        '''
        tableFound=False
        for table in tableList:
            if table['name']==tableName:
                tableFound=True
                break
        count=0
        # only count records if the table actually exists
        if tableFound:
            query="SELECT Count(*) AS count FROM %s" % tableName
            countResult=self.sqlDB.query(query)
            count=countResult[0]['count']
        return count

    def db_has_data(self):
        '''
        check whether the database has data / is populated

        Returns:
            boolean: True if the cities table exists and has more than one record
        '''
        tableList=self.sqlDB.getTableList()
        # heuristic thresholds for a fully populated database
        hasCities=self.db_recordCount(tableList,"citiesWithPopulation")>10000
        hasCountries=self.db_recordCount(tableList,"countries")>100
        hasRegions=self.db_recordCount(tableList,"regions")>1000
        hasVersion=self.db_recordCount(tableList,"Version")==1
        versionOk=False
        if hasVersion:
            query="SELECT version from Version"
            dbVersionList=self.sqlDB.query(query)
            versionOk=dbVersionList[0]['version']==self.dbVersion
        #hasWikidataCities=self.db_recordCount(tableList,'City_wikidata')>100000
        ok=hasVersion and versionOk and hasCities and hasRegions and hasCountries
        return ok
    def getWikidataCityPopulation(self,sqlDB,endpoint=None):
        '''
        get the wikidata city population database - create it from a wikidata query
        or download a prebuilt copy - and merge its content into the given sqlDB

        Args:
            sqlDB(SQLDB): target SQL database
            endpoint(str): url of the wikidata endpoint or None if default should be used
        '''
        dbFile=self.db_path+"/city_wikidata_population.db"
        rawTableName="cityPops"
        # is the wikidata population database available?
        if not os.path.exists(dbFile):
            # shall we created it from a wikidata query?
            if endpoint is not None:
                wikidata=Wikidata()
                wikidata.endpoint=endpoint
                cityList=wikidata.getCityPopulations()
                wikiCitiesDB=SQLDB(dbFile)
                # derive the table schema from a sample of 300 records
                entityInfo=wikiCitiesDB.createTable(cityList[:300],rawTableName,primaryKey=None,withDrop=True)
                wikiCitiesDB.store(cityList,entityInfo,fixNone=True)
            else:
                # just download a copy
                print("Downloading %s ... this might take a few seconds" % dbFile)
                # NOTE(review): plain-http download without a checksum - confirm the source is trusted
                dbUrl="http://wiki.bitplan.com/images/confident/city_wikidata_population.db"
                urllib.request.urlretrieve(dbUrl,dbFile)
        # (re) open the database
        wikiCitiesDB=SQLDB(dbFile)
        # check whether the table is populated in the TARGET database
        tableList=sqlDB.getTableList()
        tableName="citiesWithPopulation"
        if self.db_recordCount(tableList, tableName)<10000:
            # check that database is writable
            # https://stackoverflow.com/a/44707371/1497139
            sqlDB.execute("pragma user_version=0")
            # makes sure both tables are in target sqlDB
            wikiCitiesDB.copyTo(sqlDB)
            # create joined table: geolite2 cities enriched with wikidata population,
            # plus wikidata-only cities that have no geolite2 match
            sqlQuery="""
select geoname_id, city_name, cp.cityLabel, country_iso_code, country_name,
  subdivision_1_iso_code, subdivision_1_name, cp.city as wikidataurl, cp.cityPop
from cities c
join cityPops cp on c.geoname_id=cp.geoNameId
union
select geoNameId as geoname_id, null as city_name, cityLabel,
  countryIsoCode as country_iso_code, countryLabel as country_name,
  null as subdivision_1_iso_code, null as subdivision_1_name,
  city as wikidataurl, cityPop
from cityPops
where cityPop is not Null
group by geoNameId
order by cityPop desc
"""
            cityList=sqlDB.query(sqlQuery)
            entityInfo=sqlDB.createTable(cityList,tableName,primaryKey=None,withDrop=True,sampleRecordCount=500)
            sqlDB.store(cityList,entityInfo,fixNone=True)
def main(cls, args): ''' command line activation with parsed args Args: args(list): the command line arguments ''' debug = args.debug endpoints = EndpointManager.getEndpoints(args.endpointPath) qm = QueryManager(lang=args.language, debug=debug, queriesPath=args.queriesPath) query = None queryCode = args.query endpointConf = None formats = None # preload ValueFormatter ValueFormatter.getFormats(args.formatsPath) if args.list: for name, query in qm.queriesByName.items(): print(f"{name}:{query.title}") elif args.listEndpoints: # list endpoints for endpoint in endpoints.values(): if hasattr(endpoint, "lang") and endpoint.lang == args.language: print(endpoint) elif args.queryName is not None: if debug or args.showQuery: print(f"named query {args.queryName}:") if args.queryName not in qm.queriesByName: raise Exception(f"named query {args.queryName} not available") query = qm.queriesByName[args.queryName] formats = query.formats queryCode = query.query if debug or args.showQuery: if hasattr(query, "description") and query.description is not None: print(query.description) if query is None: name = "?" 
if queryCode is None and args.queryFile is not None: queryFilePath = Path(args.queryFile) queryCode = queryFilePath.read_text() name = queryFilePath.stem query = Query(name="?", query=queryCode, lang=args.language) if queryCode: if debug or args.showQuery: print(f"{args.language}:\n{queryCode}") if args.endpointName: endpointConf = endpoints.get(args.endpointName) if args.language == "sparql": method = 'POST' if args.endpointName: endPointUrl = endpointConf.endpoint method = endpointConf.method query.tryItUrl = endpointConf.website query.database = endpointConf.database else: endPointUrl = query.endpoint if args.method: method = method sparql = SPARQL(endPointUrl, method=method) if args.prefixes and endpointConf is not None: queryCode = f"{endpointConf.prefixes}\n{queryCode}" if args.raw: qres = cls.rawQuery(endPointUrl, query=query.query, resultFormat=args.format, mimeType=args.mimeType) print(qres) return if "wikidata" in args.endpointName and formats is None: formats = ["*:wikidata"] qlod = sparql.queryAsListOfDicts(queryCode) elif args.language == "sql": sqlDB = SQLDB(endpointConf.endpoint) qlod = sqlDB.query(queryCode) else: raise Exception( f"language {args.language} not known/supported") if args.format is Format.csv: csv = CSV.toCSV(qlod) print(csv) elif args.format in [ Format.latex, Format.github, Format.mediawiki ]: doc = query.documentQueryResult(qlod, tablefmt=str(args.format), floatfmt=".0f") docstr = doc.asText() print(docstr) elif args.format in [Format.json ] or args.format is None: # set as default # https://stackoverflow.com/a/36142844/1497139 print(json.dumps(qlod, indent=2, sort_keys=True, default=str)) elif args.format in [Format.xml]: lod2xml = Lod2Xml(qlod) xml = lod2xml.asXml() print(xml) else: raise Exception(f"format {args.format} not supported yet")
class TestSQLDB(Basetest):
    '''
    Test the SQLDB database wrapper
    '''

    def checkListOfRecords(self,listOfRecords,entityName,primaryKey=None,executeMany=True,fixNone=False,fixDates=False,debug=False,doClose=True):
        '''
        check the handling of the given list of Records

        Args:
            listOfRecords(list): a list of dicts that contain the data to be stored
            entityName(string): the name of the entity type to be used as a table name
            primaryKey(string): the name of the key / column to be used as a primary key
            executeMany(boolean): True if executeMany mode of sqlite3 should be used
            fixNone(boolean): fix dict entries that are undefined to have a "None" entry
            fixDates(boolean): True if date entries should be fixed on retrieval
            debug(boolean): True if debug information e.g. CREATE TABLE and INSERT INTO commands should be shown
            doClose(boolean): True if the connection should be closed

        Returns:
            list: the records as retrieved back from the database
        '''
        size=len(listOfRecords)
        if self.debug:
            print("%s size is %d fixNone is %r fixDates is: %r" % (entityName,size,fixNone,fixDates))
        # keep the database as an attribute so later tests (e.g. testBackup) can reuse it
        self.sqlDB=SQLDB(debug=debug,errorDebug=True)
        # derive the schema from a sample of at most 10 records
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityName,primaryKey)
        startTime=time.time()
        self.sqlDB.store(listOfRecords,entityInfo,executeMany=executeMany,fixNone=fixNone)
        elapsed=time.time()-startTime
        if self.debug:
            print ("adding %d %s records took %5.3f s => %5.f records/s" % (size,entityName,elapsed,size/elapsed))
        resultList=self.sqlDB.queryAll(entityInfo,fixDates=fixDates)
        if self.debug:
            # NOTE(review): reuses the store elapsed time for the select rate - timing is approximate
            print ("selecting %d %s records took %5.3f s => %5.f records/s" % (len(resultList),entityName,elapsed,len(resultList)/elapsed))
        if doClose:
            self.sqlDB.close()
        return resultList

    def testEntityInfo(self):
        '''
        test creating entityInfo from the sample record
        '''
        listOfRecords=Sample.getRoyals()
        entityInfo=EntityInfo(listOfRecords[:3],'Person','name',debug=True)
        # the DDL and DML commands are derived from the sample records
        self.assertEqual("CREATE TABLE Person(name TEXT PRIMARY KEY,born DATE,numberInLine INTEGER,wikidataurl TEXT,age FLOAT,ofAge BOOLEAN,lastmodified TIMESTAMP)",entityInfo.createTableCmd)
        self.assertEqual("INSERT INTO Person (name,born,numberInLine,wikidataurl,age,ofAge,lastmodified) values (:name,:born,:numberInLine,:wikidataurl,:age,:ofAge,:lastmodified)",entityInfo.insertCmd)
        self.sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityInfo.name,entityInfo.primaryKey)
        tableList=self.sqlDB.getTableList()
        if self.debug:
            print (tableList)
        self.assertEqual(1,len(tableList))
        personTable=tableList[0]
        self.assertEqual("Person",personTable['name'])
        self.assertEqual(7,len(personTable['columns']))
        # check the UML rendering of the schema
        uml=UML()
        plantUml=uml.tableListToPlantUml(tableList,packageName="Royals",withSkin=False)
        if self.debug:
            print(plantUml)
        expected="""package Royals {
  class Person << Entity >> {
   age : FLOAT
   born : DATE
   lastmodified : TIMESTAMP
   name : TEXT <<PK>>
   numberInLine : INTEGER
   ofAge : BOOLEAN
   wikidataurl : TEXT
  }
}
"""
        self.assertEqual(expected,plantUml)
        # testGeneralization
        listOfRecords=[{'name': 'Royal family', 'country': 'UK', 'lastmodified':datetime.now()}]
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],'Family','name')
        tableList=self.sqlDB.getTableList()
        self.assertEqual(2,len(tableList))
        uml=UML()
        # common columns of Person and Family are pulled up into PersonBase
        plantUml=uml.tableListToPlantUml(tableList,generalizeTo="PersonBase",withSkin=False)
        if self.debug:
            print(plantUml)
        expected='''class PersonBase << Entity >> {
 lastmodified : TIMESTAMP
 name : TEXT <<PK>>
}
class Person << Entity >> {
 age : FLOAT
 born : DATE
 numberInLine : INTEGER
 ofAge : BOOLEAN
 wikidataurl : TEXT
}
class Family << Entity >> {
 country : TEXT
}
PersonBase <|-- Person
PersonBase <|-- Family
'''
        self.assertEqual(expected,plantUml)

    def testIssue15(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/15

        auto create view ddl in mergeschema
        '''
        self.sqlDB=SQLDB(debug=self.debug,errorDebug=self.debug)
        listOfRecords=Sample.getRoyals()
        entityInfo=EntityInfo(listOfRecords[:3],'Person','name',debug=self.debug)
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityInfo.name,entityInfo.primaryKey)
        listOfRecords=[{'name': 'Royal family', 'country': 'UK', 'lastmodified':datetime.now()}]
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],'Family','name')
        tableList=self.sqlDB.getTableList()
        # the view selects the columns shared by both tables
        viewDDL=Schema.getGeneralViewDDL(tableList,"PersonBase")
        if self.debug:
            print (viewDDL)
        expected="""CREATE VIEW PersonBase AS
  SELECT name,lastmodified FROM Person
UNION
  SELECT name,lastmodified FROM Family"""
        self.assertEqual(expected,viewDDL)
        pass

    def testUniqueConstraint(self):
        '''
        test for https://github.com/WolfgangFahl/pyLoDStorage/issues/4
        sqlite3.IntegrityError: UNIQUE constraint failed: ... show debug info
        '''
        listOfDicts=[
            {"name": "John Doe"},
            {"name": "Frank Doe"},
            {"name": "John Doe"},
            {"name":"Tim Doe"}]
        sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        entityInfo=sqlDB.createTable(listOfDicts[:10],'Does','name')
        try:
            # the duplicate "John Doe" must violate the primary key constraint
            sqlDB.store(listOfDicts,entityInfo,executeMany=False)
            self.fail("There should be an exception")
        except Exception as ex:
            # the error message pinpoints the offending record
            expected="""INSERT INTO Does (name) values (:name)
failed:UNIQUE constraint failed: Does.name
record  #3={'name': 'John Doe'}"""
            errMsg=str(ex)
            self.assertEqual(expected,errMsg)

    def testSqlite3(self):
        '''
        test sqlite3 with a few records from the royal family
        '''
        listOfRecords=Sample.getRoyals()
        # round-trip: store and retrieve must yield identical records
        resultList=self.checkListOfRecords(listOfRecords, 'Person', 'name',debug=True)
        if self.debug:
            print(resultList)
        self.assertEqual(listOfRecords,resultList)

    def testIssue13_setNoneValue(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/13

        set None value for undefined LoD entries
        '''
        listOfRecords=[
            { 'make': 'Ford','model':'Model T', 'color':'black'},
            { 'make': 'VW', 'model':'beetle'}]
        entityName="Car"
        # NOTE(review): "Model" does not match the record key 'model' -
        # presumably relies on sqlite's case-insensitive column names; confirm
        primaryKey="Model"
        resultList=self.checkListOfRecords(listOfRecords, entityName, primaryKey,fixNone=True)
        if self.debug:
            print (resultList)

    def testIssue14_execute(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/14

        offer execute wrapper directly via sqlDB
        '''
        sqlDB=SQLDB()
        ddl="""
        CREATE TABLE contacts (
            contact_id INTEGER PRIMARY KEY,
            first_name TEXT NOT NULL,
            last_name TEXT NOT NULL
        )
        """
        sqlDB.execute(ddl)
        tableList=sqlDB.getTableList()
        if self.debug:
            print(tableList)
        self.assertEqual(1,len(tableList))
        self.assertEqual("contacts",tableList[0]['name'])

    def testIssue41(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/41

        improve error message when create table command fails
        '''
        # 'py/object' is not a valid SQL column name and must make CREATE TABLE fail
        listOfRecords=[{
            'name':'value',
            'py/object': 'datetime.time'
        }]
        self.sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        try:
            _entityInfo=self.sqlDB.createTable(listOfRecords[:1],'Invalid','name')
            self.fail("There should be an exception")
        except Exception as ex:
            # the failing DDL command is part of the error message
            self.assertTrue("CREATE TABLE Invalid" in str(ex))

    def testBindingError(self):
        '''
        test list of Records with incomplete record leading to
        "You did not supply a value for binding 2"

        see https://bugs.python.org/issue41638
        '''
        # second record is missing the 'type' column
        listOfRecords=[{'name':'Pikachu', 'type':'Electric'},{'name':'Raichu' }]
        for executeMany in [True,False]:
            try:
                self.checkListOfRecords(listOfRecords,'Pokemon','name',executeMany=executeMany)
                self.fail("There should be an exception")
            except Exception as ex:
                if self.debug:
                    print(str(ex))
                self.assertTrue('no value supplied for column' in str(ex))

    def testListOfCities(self):
        '''
        test sqlite3 with some 120000 city records
        '''
        listOfRecords=Sample.getCities()
        for fixDates in [True,False]:
            retrievedList=self.checkListOfRecords(listOfRecords,'City',fixDates=fixDates)
            self.assertEqual(len(listOfRecords),len(retrievedList))

    def testQueryParams(self):
        '''
        test Query Params
        '''
        listOfDicts=[
            {"city": "New York", "country": "US"},
            {"city": "Amsterdam", "country": "NL"},
            {"city": "Paris", "country": "FR"}]
        sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        entityInfo=sqlDB.createTable(listOfDicts[:10],'cities','city')
        sqlDB.store(listOfDicts,entityInfo,executeMany=False)
        # parameterized query avoids SQL injection and quoting issues
        query="SELECT * from cities WHERE country in (?)"
        params=('FR',)
        frCities=sqlDB.query(query,params)
        if self.debug:
            print (frCities);
        self.assertEqual([{'city': 'Paris', 'country': 'FR'}],frCities)

    def testSqllite3Speed(self):
        '''
        test sqlite3 speed with some 100000 artificial sample records
        consisting of two columns with a running index
        '''
        limit=100000
        listOfRecords=Sample.getSample(limit)
        self.checkListOfRecords(listOfRecords, 'Sample', 'pKey')

    def testIssue87AllowUsingQueryWithGenerator(self):
        '''
        test the query gen approach
        '''
        debug=self.debug
        #debug=True
        sqlDB=self.getSampleTableDB(sampleSize=5)
        sqlQuery="select * FROM sample"
        # queryGen yields one record at a time instead of materializing a list
        for cindex,record in enumerate(sqlDB.queryGen(sqlQuery)):
            if debug:
                print(record)
            self.assertEqual(cindex,record["cindex"])

    def testBackup(self):
        '''
        test creating a backup of the SQL database
        '''
        # sqlite3 backup API needs Python 3.7+
        if sys.version_info >= (3, 7):
            listOfRecords=Sample.getCities()
            # doClose=False: keep self.sqlDB open for the backup call
            self.checkListOfRecords(listOfRecords,'City',fixDates=True,doClose=False)
            backupDB="/tmp/testSqlite.db"
            showProgress=200 if self.debug else 0
            self.sqlDB.backup(backupDB,profile=self.debug,showProgress=showProgress)
            size=os.stat(backupDB).st_size
            if self.debug:
                print ("size of backup DB is %d" % size)
            self.assertTrue(size>600000)
            self.sqlDB.close()
            # restore into an in-memory database and compare record counts
            ramDB=SQLDB.restore(backupDB, SQLDB.RAM, profile=self.debug,showProgress=showProgress)
            entityInfo=EntityInfo(listOfRecords[:50],'City',debug=self.debug)
            allCities=ramDB.queryAll(entityInfo)
            self.assertEqual(len(allCities),len(listOfRecords))

    def testCopy(self):
        '''
        test copying databases into another database
        '''
        dbFile="/tmp/DAWT_Sample3x1000.db"
        copyDB=SQLDB(dbFile)
        for sampleNo in range(3):
            listOfRecords=Sample.getSample(1000)
            self.checkListOfRecords(listOfRecords, 'Sample_%d_1000' %sampleNo, 'pKey',doClose=False)
            self.sqlDB.copyTo(copyDB)
        size=os.stat(dbFile).st_size
        if self.debug:
            print ("size of copy DB is %d" % size)
        self.assertTrue(size>70000)
        tableList=copyDB.getTableList()
        if self.debug:
            print(tableList)
        for sampleNo in range(3):
            self.assertEqual('Sample_%d_1000' %sampleNo,tableList[sampleNo]['name'])
        # check that database is writable
        # https://stackoverflow.com/a/44707371/1497139
        copyDB.execute("pragma user_version=0")

    @staticmethod
    def getSampleTableDB(withDrop=False,debug=False,failIfTooFew=False,sampleSize=1000):
        '''
        get an in-memory sample database

        Args:
            withDrop(bool): True if an existing sample table should be dropped first
            debug(bool): True to show debug information
            failIfTooFew(bool): True if createTable should fail when fewer than
                sampleRecordCount records are available
            sampleSize(int): the number of sample records to create

        Returns:
            SQLDB: the database populated with the sample records
        '''
        listOfRecords=Sample.getSample(sampleSize)
        sqlDB=SQLDB()
        entityName="sample"
        primaryKey='pKey'
        # deliberately ask for 10x more sample records than available (see testIssue16)
        sampleRecordCount=sampleSize*10
        sqlDB.debug=debug
        entityInfo=sqlDB.createTable(listOfRecords, entityName, primaryKey=primaryKey, withDrop=withDrop, sampleRecordCount=sampleRecordCount,failIfTooFew=failIfTooFew)
        executeMany=True
        fixNone=True
        sqlDB.store(listOfRecords,entityInfo,executeMany=executeMany,fixNone=fixNone)
        return sqlDB

    def testIssue16(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/16
        allow to only warn if samplerecordcount is higher than number of available records
        '''
        # failIfTooFew=False: only warn about the record shortage
        self.getSampleTableDB(withDrop=False, debug=True,failIfTooFew=False)
        try:
            # failIfTooFew=True: must raise
            self.getSampleTableDB(withDrop=True, debug=True,failIfTooFew=True)
            self.fail("There should be an exception that too few sample records where provided")
        except Exception as ex:
            self.assertTrue("only 1000/10000 of needed sample records to createTable available" in str(ex))

    def testIssue18(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/18
        '''
        sqlDB=self.getSampleTableDB()
        tableDict=sqlDB.getTableDict()
        if self.debug:
            print (tableDict)
        self.assertTrue("sample" in tableDict)
        cols=tableDict["sample"]["columns"]
        # column names are lower-cased in the table dict
        self.assertTrue("pkey" in cols)
    def testEntityInfo(self):
        '''
        test creating entityInfo from the sample record

        NOTE(review): this appears to be a duplicate of an earlier testEntityInfo
        definition - in Python the later definition silently shadows the earlier
        one; confirm whether the duplication is intentional
        '''
        listOfRecords=Sample.getRoyals()
        entityInfo=EntityInfo(listOfRecords[:3],'Person','name',debug=True)
        # the DDL and DML commands are derived from the sample records
        self.assertEqual("CREATE TABLE Person(name TEXT PRIMARY KEY,born DATE,numberInLine INTEGER,wikidataurl TEXT,age FLOAT,ofAge BOOLEAN,lastmodified TIMESTAMP)",entityInfo.createTableCmd)
        self.assertEqual("INSERT INTO Person (name,born,numberInLine,wikidataurl,age,ofAge,lastmodified) values (:name,:born,:numberInLine,:wikidataurl,:age,:ofAge,:lastmodified)",entityInfo.insertCmd)
        self.sqlDB=SQLDB(debug=self.debug,errorDebug=True)
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],entityInfo.name,entityInfo.primaryKey)
        tableList=self.sqlDB.getTableList()
        if self.debug:
            print (tableList)
        self.assertEqual(1,len(tableList))
        personTable=tableList[0]
        self.assertEqual("Person",personTable['name'])
        self.assertEqual(7,len(personTable['columns']))
        # check the UML rendering of the schema
        uml=UML()
        plantUml=uml.tableListToPlantUml(tableList,packageName="Royals",withSkin=False)
        if self.debug:
            print(plantUml)
        expected="""package Royals {
  class Person << Entity >> {
   age : FLOAT
   born : DATE
   lastmodified : TIMESTAMP
   name : TEXT <<PK>>
   numberInLine : INTEGER
   ofAge : BOOLEAN
   wikidataurl : TEXT
  }
}
"""
        self.assertEqual(expected,plantUml)
        # testGeneralization
        listOfRecords=[{'name': 'Royal family', 'country': 'UK', 'lastmodified':datetime.now()}]
        entityInfo=self.sqlDB.createTable(listOfRecords[:10],'Family','name')
        tableList=self.sqlDB.getTableList()
        self.assertEqual(2,len(tableList))
        uml=UML()
        # common columns of Person and Family are pulled up into PersonBase
        plantUml=uml.tableListToPlantUml(tableList,generalizeTo="PersonBase",withSkin=False)
        if self.debug:
            print(plantUml)
        expected='''class PersonBase << Entity >> {
 lastmodified : TIMESTAMP
 name : TEXT <<PK>>
}
class Person << Entity >> {
 age : FLOAT
 born : DATE
 numberInLine : INTEGER
 ofAge : BOOLEAN
 wikidataurl : TEXT
}
class Family << Entity >> {
 country : TEXT
}
PersonBase <|-- Person
PersonBase <|-- Family
'''
        self.assertEqual(expected,plantUml)