class Gazetteer(object): # sorting by population is a workaround for entries with multiple parents # (without the sorting loops occure) QUERY_HAS_PARENT = ''' SELECT parent_id FROM gazetteerentity ga JOIN locatedin ON (ga.id = locatedin.child_id) JOIN gazetteerentity gb ON (gb.id = locatedin.parent_id) WHERE child_id = %d order by gb.population DESC LIMIT 1''' DEBUG = False def __init__(self): """ initializes the gazetteer object and the database connections """ self.db = PostgresqlDb(**DATABASE_CONNECTION['gazetteer']) self.db.connect() self.db2 = PostgresqlDb(**DATABASE_CONNECTION['geo_mapping']) self.db2.connect() def getGeoEntityDict(self, name=None, id=None, geoUrl=None): """ returns a list of GeoEntities matching the given information @param[in] name of the Entity @param[in] id the GeoNames id @param[in] geoUrl the entity's dictionary """ geoId = [] if name: geoId.extend(self.getIdFromName(self, name)) if id: geoId.append(id) if geoUrl: geoId.extend(self.getIdFromGeoUrl(geoUrl)) return self.getGeoEntityDictFromId(geoId) @MemoryCached def getIdFromName(self, name): """ returns the possible GeoNames ids for the given name @param[in] name @returns a list of GeoNames ids """ query = "SELECT id FROM vw_gazetteer WHERE name='%s' ORDER BY population DESC" % ( name.replace("'", "''")) res = [r['id'] for r in self.db.query(query)] #query = "SELECT DISTINCT entity_id FROM vw_entry_id_has_name WHERE name='%s'" % ( name.replace("'", "''") ) return [r['id'] for r in self.db.query(query)] def getIdFromGeoUrl(self, geoUrl): """ returns the geoId for the given geoUrl @param[in] geoUrl @returns a list of geonames ids matching the geoUrl """ geoUrl = geoUrl.split(GEO_ENTITY_SEPARATOR) join = [] where = [] for nr, name in enumerate(geoUrl): join.append("JOIN locatedin L%d ON (A%d.id = L%d.parent_id) JOIN gazetteerentity A%d ON (A%d.id = L%d.child_id)" % ( nr, nr, nr, nr + 1, nr + 1, nr)) where.append("A%d.id IN (%s)" % (nr, ", ".join( map(str, self.getIdFromName(self, name))))) query = "SELECT A%d.id AS id FROM gazetteerentity A0 %s WHERE %s;" % ( nr, " ".join(join[:-1]), " AND ".join(where)) return [int(r['id']) for r in self.db.query(query)] # @MemoryCached def getGeoEntityDictFromId(self, id): """ returns the location of the GazetteerEntry ID @param id a list of geonames ids @return list of GeoEntities """ if id: q = "SELECT * FROM gazetteerentity LEFT JOIN countryInfo USING(id) WHERE id IN (%s)" % ", ".join( map(str, id)) res = self.db.query(q) if len(res) > 0: entities = [dict(list(self._getResultById(res, i).items())) for i in id] self._addGeoUrl(entities) return entities print("WARNING: no entities found for ", ", ".join(map(str, id))) return [] def _getResultById(self, l, id, idAttr="id"): """ returns the database column matching the given id @param[in] l the list of query results @param[in] id the row id to return @param[in] idAttr attribute to consider for the id """ return [e for e in l if e[idAttr] == id][0] def _addGeoUrl(self, entities): """ adds the geoUrl and level key to the given list of entities """ for entity in entities: idUrl, nameUrl = self._getGeoUrl(entity['id']) entity['geoUrl'] = GEO_ENTITY_SEPARATOR.join(nameUrl) entity['idUrl'] = idUrl # hierarchy level of the entity (e.g. eu>at => 2) entity['level'] = len(nameUrl) def _getGeoUrl(self, id): """ returns the geoUrl for the given entity @param[in] the geonames gazetteer id @returns two lists containing (geoIdPath, geoNamePath) """ geoNamePath = [self._getPreferredGeoName(id)] geoIdPath = [id] while id: parentLocationEntity = self._hasParent(id) if parentLocationEntity: parentLocationName = self._getPreferredGeoName( parentLocationEntity) if parentLocationEntity in geoIdPath: print("%s in %s" % (parentLocationName, geoNamePath)) break geoNamePath.append(parentLocationName) geoIdPath.append(parentLocationEntity) id = parentLocationEntity geoIdPath.reverse() geoNamePath.reverse() return (geoIdPath[1:], geoNamePath) # gets the preferred name for the location # @param id # @returns preferred name def _getPreferredGeoName(self, id): """ returns the preferred entry name for the given entity id @param[in] entity_id @returns the geo entity's name """ query = '''SELECT name FROM vw_gazetteer_tng WHERE id=%d ORDER BY (lang='en') DESC, (lang IS NULL) DESC, (lang = '') DESC, (short=TRUE and short IS NOT NULL) DESC, preferred DESC LIMIT 1''' % (id) result = self.db.query(query) if not result: raise GazetteerEntryNotFound(id, query) return Gazetteer.DEBUG and result[0]['name'] + "(%d)" % (id) or result[0]['name'] # checks if the given ID has a parent # @param ID of the child # @return false or ID of the parent def _hasParent(self, child_id): query = self.QUERY_HAS_PARENT % child_id result = self.db.query(query) # todo: is it necessary, that this functions can process multiple parents? # multiple parents (!) if result.__len__() > 1: print('### result > 1 ###') print(' child_id: %s' % child_id) print(' parent_id: %s ' % [e['parent_id'] for e in result]) # todo: does it make sense to fetch infinite loops if result == []: return 0 else: return result[0]['parent_id']
class Gazetteer(object): # sorting by population is a workaround for entries with multiple parents # (without the sorting loops occure) QUERY_HAS_PARENT = ''' SELECT parent_id FROM gazetteerentity ga JOIN locatedin ON (ga.id = locatedin.child_id) JOIN gazetteerentity gb ON (gb.id = locatedin.parent_id) WHERE child_id = %d order by gb.population DESC LIMIT 1''' QUERY_CONTENT_ID = ''' SELECT gazetteer_id FROM content_id_gazeteer_id WHERE content_id = %d ''' QUERY_NAME = ''' SELECT entity_id, ispreferredname, lang, gazetteerentry_id FROM gazetteerentry_ordered_names WHERE name LIKE '%s' ''' DEBUG = False ## init - establishes the db-connections def __init__(self): """ implement me """ self.db = PostgresqlDb( **DATABASE_CONNECTION['gazetteer'] ) #PostgresqlDb.DEBUG=True self.db.connect() self.db2 = PostgresqlDb( **DATABASE_CONNECTION['geo_mapping'] ) self.db2.connect() @MemoryCached def getGeoNameFromContentID(self, content_id): """ returns the location of the content ID @param content_id @return list of locaions, e.g. ['Europa', 'France', 'Centre'] """ query = self.QUERY_CONTENT_ID % content_id result = self.db2.query(query) if result == []: return 'ContentID not found!' else: gaz_id = result[0]['gazetteer_id'] return Gazetteer.getGeoNameFromGeoId(self, gaz_id) ## returns the location of the GazetteerEntry ID # @param gazetteer-entry ID # @return list of locations, e.g. ['Europa', 'France', 'Centre'] @MemoryCached def getGeoNameFromGeoId(self, gazetteer_id): result = self.__getLocationTree(gazetteer_id) result.reverse() if result == []: return 'GazetteerID not found!' else: return result @MemoryCached def getGeoNameFromString(self, name): """ returns the geoname for the given string @param string @return a list of tuples (population, location) """ res = set() query = '''SELECT entity_id, population FROM gazetteerentry JOIN hasname ON (gazetteerentry.id = hasname.entry_id) JOIN gazetteerentity ON (gazetteerentity.id=hasname.entity_id) WHERE name = '%%s' AND (population > %d or feature_code in ('ADM1', 'ADM2', 'ADM3')) ''' % ( MIN_POPULATION) for result in self.db.query(query % name.replace("'", "''")): try: tmp = Gazetteer.getGeoNameFromGeoId(self, result['entity_id']) res.add( (result['population'], tuple(tmp)) ) except GazetteerEntryNotFound: pass return list(res) ## recursive function to build the full location tree # @param id # @returns list of locations def __getLocationTree(self, id): geoPath = [ self.__getPreferredGeoName( id ) ] geoIdPath = [ id ] while id: parentLocationEntity = self.__hasParent(id) if parentLocationEntity: parentLocationName = self.__getPreferredGeoName( parentLocationEntity ) if parentLocationEntity in geoIdPath: print "%s in %s" % (parentLocationName, geoPath) break geoPath.append( parentLocationName ) geoIdPath.append( parentLocationEntity ) id = parentLocationEntity return geoPath ## gets the preferred name for the location # @param id # @returns preferred name def __getPreferredGeoName(self, id): """ returns the preferred entry name for the given entity id @param[in] entity_id @returns the geo entity's name """ query = '''SELECT name FROM vw_gazetteer_tng WHERE id=%d ORDER BY (lang='en' and lang is not null) DESC, (short=TRUE and short IS NOT NULL) DESC, preferred DESC LIMIT 1''' % (id) result = self.db.query(query) if not result: raise GazetteerEntryNotFound(id, query) return Gazetteer.DEBUG and result[0]['name']+"(%d)" % (id) or result[0]['name'] ## checks if the given ID has a parent # @param ID of the child # @return false or ID of the parent def __hasParent(self, child_id): query = self.QUERY_HAS_PARENT % child_id result = self.db.query(query) # todo: is it necessary, that this functions can process multiple parents? # multiple parents (!) if result.__len__() > 1: print '### result > 1 ###' print ' child_id: %s' % child_id print ' parent_id: %s ' % [ e['parent_id'] for e in result ] # todo: does it make sense to fetch infinite loops if result == []: return 0 else: return result[0]['parent_id'] @MemoryCached def __getNameGeoId(self, name): """ returns the possible geoids for the given name @param[in] name @returns a list of geoids """ query = "SELECT DISTINCT entity_id FROM vw_entry_id_has_name WHERE name='%s'" % ( name.replace("'", "''") ) return [ r['entity_id'] for r in self.db.query( query ) ] def getGeoIdFromGeoUrl(self, geoUrl): """ returns the geoId forv the given geoUrl @param[in] geoUrl String or List containing the geoUrl @returns the geoId """ if isinstance(geoUrl, str): geoUrl = geoUrl.split("/") join = [] where = [] for nr, name in enumerate( geoUrl ): join.append("JOIN locatedin L%d ON (A%d.id = L%d.parent_id) JOIN gazetteerentity A%d ON (A%d.id = L%d.child_id)" % (nr, nr, nr, nr+1, nr+1, nr) ) where.append( "A%d.id IN (%s)" % (nr, ", ".join( map(str, self.__getNameGeoId(self, name))) )) query = "SELECT A%d.id AS id FROM gazetteerentity A0 %s WHERE %s;" % ( nr, " ".join(join[:-1]), " AND ".join(where) ) return [ int(r['id']) for r in self.db.query( query ) ] def getGeoDict(self, geoId): """ returns a dictinary with all information about the given geoId """ print geoId query = "SELECT * FROM gazetteerentity WHERE id = %s" % geoId res = self.db.query( query ) if len(res)>0: return dict(res[0]) else: return {}
class Gazetteer(object): # sorting by population is a workaround for entries with multiple parents # (without the sorting loops occure) QUERY_HAS_PARENT = ''' SELECT parent_id FROM gazetteerentity ga JOIN locatedin ON (ga.id = locatedin.child_id) JOIN gazetteerentity gb ON (gb.id = locatedin.parent_id) WHERE child_id = %d order by gb.population DESC LIMIT 1''' QUERY_CONTENT_ID = ''' SELECT gazetteer_id FROM content_id_gazeteer_id WHERE content_id = %d ''' QUERY_NAME = ''' SELECT entity_id, ispreferredname, lang, gazetteerentry_id FROM gazetteerentry_ordered_names WHERE name LIKE '%s' ''' DEBUG = False ## init - establishes the db-connections def __init__(self): """ implement me """ self.db = PostgresqlDb(**DATABASE_CONNECTION['gazetteer']) #PostgresqlDb.DEBUG=True self.db.connect() self.db2 = PostgresqlDb(**DATABASE_CONNECTION['geo_mapping']) self.db2.connect() @MemoryCached def getGeoNameFromContentID(self, content_id): """ returns the location of the content ID @param content_id @return list of locaions, e.g. ['Europa', 'France', 'Centre'] """ query = self.QUERY_CONTENT_ID % content_id result = self.db2.query(query) if result == []: return 'ContentID not found!' else: gaz_id = result[0]['gazetteer_id'] return Gazetteer.getGeoNameFromGeoId(self, gaz_id) ## returns the location of the GazetteerEntry ID # @param gazetteer-entry ID # @return list of locations, e.g. ['Europa', 'France', 'Centre'] @MemoryCached def getGeoNameFromGeoId(self, gazetteer_id): result = self.__getLocationTree(gazetteer_id) result.reverse() if result == []: return 'GazetteerID not found!' else: return result @MemoryCached def getGeoNameFromString(self, name): """ returns the geoname for the given string @param string @return a list of tuples (population, location) """ res = set() query = '''SELECT entity_id, population FROM gazetteerentry JOIN hasname ON (gazetteerentry.id = hasname.entry_id) JOIN gazetteerentity ON (gazetteerentity.id=hasname.entity_id) WHERE name = '%%s' AND (population > %d or feature_code in ('ADM1', 'ADM2', 'ADM3')) ''' % ( MIN_POPULATION) for result in self.db.query(query % name.replace("'", "''")): try: tmp = Gazetteer.getGeoNameFromGeoId(self, result['entity_id']) res.add((result['population'], tuple(tmp))) except GazetteerEntryNotFound: pass return list(res) ## recursive function to build the full location tree # @param id # @returns list of locations def __getLocationTree(self, id): geoPath = [self.__getPreferredGeoName(id)] geoIdPath = [id] while id: parentLocationEntity = self.__hasParent(id) if parentLocationEntity: parentLocationName = self.__getPreferredGeoName( parentLocationEntity) if parentLocationEntity in geoIdPath: print "%s in %s" % (parentLocationName, geoPath) break geoPath.append(parentLocationName) geoIdPath.append(parentLocationEntity) id = parentLocationEntity return geoPath ## gets the preferred name for the location # @param id # @returns preferred name def __getPreferredGeoName(self, id): """ returns the preferred entry name for the given entity id @param[in] entity_id @returns the geo entity's name """ query = '''SELECT name FROM vw_gazetteer_tng WHERE id=%d ORDER BY (lang='en' and lang is not null) DESC, (short=TRUE and short IS NOT NULL) DESC, preferred DESC LIMIT 1''' % (id) result = self.db.query(query) if not result: raise GazetteerEntryNotFound(id, query) return Gazetteer.DEBUG and result[0]['name'] + "(%d)" % (id) or result[ 0]['name'] ## checks if the given ID has a parent # @param ID of the child # @return false or ID of the parent def __hasParent(self, child_id): query = self.QUERY_HAS_PARENT % child_id result = self.db.query(query) # todo: is it necessary, that this functions can process multiple parents? # multiple parents (!) if result.__len__() > 1: print '### result > 1 ###' print ' child_id: %s' % child_id print ' parent_id: %s ' % [e['parent_id'] for e in result] # todo: does it make sense to fetch infinite loops if result == []: return 0 else: return result[0]['parent_id'] @MemoryCached def __getNameGeoId(self, name): """ returns the possible geoids for the given name @param[in] name @returns a list of geoids """ query = "SELECT DISTINCT entity_id FROM vw_entry_id_has_name WHERE name='%s'" % ( name.replace("'", "''")) return [r['entity_id'] for r in self.db.query(query)] def getGeoIdFromGeoUrl(self, geoUrl): """ returns the geoId forv the given geoUrl @param[in] geoUrl String or List containing the geoUrl @returns the geoId """ if isinstance(geoUrl, str): geoUrl = geoUrl.split("/") join = [] where = [] for nr, name in enumerate(geoUrl): join.append( "JOIN locatedin L%d ON (A%d.id = L%d.parent_id) JOIN gazetteerentity A%d ON (A%d.id = L%d.child_id)" % (nr, nr, nr, nr + 1, nr + 1, nr)) where.append( "A%d.id IN (%s)" % (nr, ", ".join(map(str, self.__getNameGeoId(self, name))))) query = "SELECT A%d.id AS id FROM gazetteerentity A0 %s WHERE %s;" % ( nr, " ".join(join[:-1]), " AND ".join(where)) return [int(r['id']) for r in self.db.query(query)] def getGeoDict(self, geoId): """ returns a dictinary with all information about the given geoId """ print geoId query = "SELECT * FROM gazetteerentity WHERE id = %s" % geoId res = self.db.query(query) if len(res) > 0: return dict(res[0]) else: return {}
class Gazetteer(object): # sorting by population is a workaround for entries with multiple parents # (without the sorting loops occure) QUERY_HAS_PARENT = ''' SELECT parent_id FROM gazetteerentity ga JOIN locatedin ON (ga.id = locatedin.child_id) JOIN gazetteerentity gb ON (gb.id = locatedin.parent_id) WHERE child_id = %d order by gb.population DESC LIMIT 1''' DEBUG = False def __init__(self): """ initializes the gazetteer object and the database connections """ self.db = PostgresqlDb( **DATABASE_CONNECTION['gazetteer'] ) self.db.connect() self.db2 = PostgresqlDb( **DATABASE_CONNECTION['geo_mapping'] ) self.db2.connect() def getGeoEntityDict(self, name=None, id=None, geoUrl=None): """ returns a list of GeoEntities matching the given information @param[in] name of the Entity @param[in] id the GeoNames id @param[in] geoUrl the entity's dictionary """ geoId = [] if name: geoId.extend( self.getIdFromName(self, name) ) if id: geoId.append( id ) if geoUrl: geoId.extend( self.getIdFromGeoUrl( geoUrl ) ) return self.getGeoEntityDictFromId( geoId ) @MemoryCached def getIdFromName(self, name): """ returns the possible GeoNames ids for the given name @param[in] name @returns a list of GeoNames ids """ query = "SELECT id FROM vw_gazetteer WHERE name='%s' ORDER BY population DESC" % ( name.replace("'", "''") ) res = [ r['id'] for r in self.db.query( query ) ] #query = "SELECT DISTINCT entity_id FROM vw_entry_id_has_name WHERE name='%s'" % ( name.replace("'", "''") ) return [ r['id'] for r in self.db.query( query ) ] def getIdFromGeoUrl(self, geoUrl): """ returns the geoId for the given geoUrl @param[in] geoUrl @returns a list of geonames ids matching the geoUrl """ geoUrl = geoUrl.split(GEO_ENTITY_SEPARATOR) join = [] where = [] for nr, name in enumerate( geoUrl ): join.append("JOIN locatedin L%d ON (A%d.id = L%d.parent_id) JOIN gazetteerentity A%d ON (A%d.id = L%d.child_id)" % (nr, nr, nr, nr+1, nr+1, nr) ) where.append( "A%d.id IN (%s)" % (nr, ", ".join( map(str, self.getIdFromName(self, name))) )) query = "SELECT A%d.id AS id FROM gazetteerentity A0 %s WHERE %s;" % ( nr, " ".join(join[:-1]), " AND ".join(where) ) return [ int(r['id']) for r in self.db.query( query ) ] # @MemoryCached def getGeoEntityDictFromId(self, id): """ returns the location of the GazetteerEntry ID @param id a list of geonames ids @return list of GeoEntities """ if id: q = "SELECT * FROM gazetteerentity LEFT JOIN countryInfo USING(id) WHERE id IN (%s)" % ", ".join( map(str, id )) res = self.db.query( q ) if len(res)>0: entities = [ dict(self._getResultById(res, i).items()) for i in id ] self._addGeoUrl( entities ) return entities print "WARNING: no entities found for ", ", ".join( map(str,id) ) return [] def _getResultById(self, l, id, idAttr="id"): """ returns the database column matching the given id @param[in] l the list of query results @param[in] id the row id to return @param[in] idAttr attribute to consider for the id """ return [ e for e in l if e[idAttr] == id ][0] def _addGeoUrl( self, entities ): """ adds the geoUrl and level key to the given list of entities """ for entity in entities: idUrl, nameUrl = self._getGeoUrl( entity['id'] ) entity['geoUrl'] = GEO_ENTITY_SEPARATOR.join(nameUrl) entity['idUrl'] = idUrl entity['level'] = len(nameUrl) # hierarchy level of the entity (e.g. eu>at => 2) def _getGeoUrl(self, id): """ returns the geoUrl for the given entity @param[in] the geonames gazetteer id @returns two lists containing (geoIdPath, geoNamePath) """ geoNamePath = [ self._getPreferredGeoName( id ) ] geoIdPath = [ id ] while id: parentLocationEntity = self._hasParent(id) if parentLocationEntity: parentLocationName = self._getPreferredGeoName( parentLocationEntity ) if parentLocationEntity in geoIdPath: print "%s in %s" % (parentLocationName, geoNamePath) break geoNamePath.append( parentLocationName ) geoIdPath.append( parentLocationEntity ) id = parentLocationEntity geoIdPath.reverse() geoNamePath.reverse() return (geoIdPath[1:], geoNamePath) ## gets the preferred name for the location # @param id # @returns preferred name def _getPreferredGeoName(self, id): """ returns the preferred entry name for the given entity id @param[in] entity_id @returns the geo entity's name """ query = '''SELECT name FROM vw_gazetteer_tng WHERE id=%d ORDER BY (lang='en') DESC, (lang IS NULL) DESC, (lang = '') DESC, (short=TRUE and short IS NOT NULL) DESC, preferred DESC LIMIT 1''' % (id) result = self.db.query(query) if not result: raise GazetteerEntryNotFound(id, query) return Gazetteer.DEBUG and result[0]['name']+"(%d)" % (id) or result[0]['name'] ## checks if the given ID has a parent # @param ID of the child # @return false or ID of the parent def _hasParent(self, child_id): query = self.QUERY_HAS_PARENT % child_id result = self.db.query(query) # todo: is it necessary, that this functions can process multiple parents? # multiple parents (!) if result.__len__() > 1: print '### result > 1 ###' print ' child_id: %s' % child_id print ' parent_id: %s ' % [ e['parent_id'] for e in result ] # todo: does it make sense to fetch infinite loops if result == []: return 0 else: return result[0]['parent_id']