Example #1
0
 def __init__(self):
     """ implement me """
     self.db = PostgresqlDb( **DATABASE_CONNECTION['gazetteer'] )
     #PostgresqlDb.DEBUG=True
     self.db.connect()
     self.db2 = PostgresqlDb( **DATABASE_CONNECTION['geo_mapping'] )
     self.db2.connect()
Example #2
0
 def __init__(self):
     """ implement me """
     self.db = PostgresqlDb(**DATABASE_CONNECTION['gazetteer'])
     #PostgresqlDb.DEBUG=True
     self.db.connect()
     self.db2 = PostgresqlDb(**DATABASE_CONNECTION['geo_mapping'])
     self.db2.connect()
Example #3
0
 def __init__(self):
     """ initializes the gazetteer object and the database connections  """
     self.db = PostgresqlDb(**DATABASE_CONNECTION['gazetteer'])
     self.db.connect()
     self.db2 = PostgresqlDb(**DATABASE_CONNECTION['geo_mapping'])
     self.db2.connect()
Example #4
0
class Gazetteer(object):
    # sorting by population is a workaround for entries with multiple parents
    # (without the sorting loops occure)
    QUERY_HAS_PARENT = '''
        SELECT parent_id 
        FROM gazetteerentity ga 
        JOIN locatedin ON (ga.id = locatedin.child_id)
        JOIN gazetteerentity gb ON (gb.id = locatedin.parent_id)
        WHERE child_id = %d order by gb.population DESC LIMIT 1'''

    DEBUG = False

    def __init__(self):
        """ initializes the gazetteer object and the database connections  """
        self.db = PostgresqlDb(**DATABASE_CONNECTION['gazetteer'])
        self.db.connect()
        self.db2 = PostgresqlDb(**DATABASE_CONNECTION['geo_mapping'])
        self.db2.connect()

    def getGeoEntityDict(self, name=None, id=None, geoUrl=None):
        """ returns a list of GeoEntities matching the given information
            @param[in] name   of the Entity
            @param[in] id     the GeoNames id 
            @param[in] geoUrl the entity's dictionary
        """
        geoId = []
        if name:
            geoId.extend(self.getIdFromName(self, name))
        if id:
            geoId.append(id)
        if geoUrl:
            geoId.extend(self.getIdFromGeoUrl(geoUrl))
        return self.getGeoEntityDictFromId(geoId)

    @MemoryCached
    def getIdFromName(self, name):
        """ returns the possible GeoNames ids for the given name 
            @param[in] name
            @returns a list of GeoNames ids
        """
        query = "SELECT id FROM vw_gazetteer WHERE name='%s' ORDER BY population DESC" % (
            name.replace("'", "''"))
        res = [r['id'] for r in self.db.query(query)]
        #query = "SELECT DISTINCT entity_id FROM vw_entry_id_has_name WHERE name='%s'" % ( name.replace("'", "''") )
        return [r['id'] for r in self.db.query(query)]

    def getIdFromGeoUrl(self, geoUrl):
        """ returns the geoId for the given geoUrl 
            @param[in] geoUrl 
            @returns a list of geonames ids matching the geoUrl
        """
        geoUrl = geoUrl.split(GEO_ENTITY_SEPARATOR)

        join = []
        where = []
        for nr, name in enumerate(geoUrl):
            join.append("JOIN locatedin L%d ON (A%d.id = L%d.parent_id) JOIN gazetteerentity A%d ON (A%d.id = L%d.child_id)" % (
                nr, nr, nr, nr + 1, nr + 1, nr))
            where.append("A%d.id IN (%s)" % (nr, ", ".join(
                map(str, self.getIdFromName(self, name)))))

        query = "SELECT A%d.id AS id FROM gazetteerentity A0 %s WHERE %s;" % (
            nr, " ".join(join[:-1]), " AND ".join(where))
        return [int(r['id']) for r in self.db.query(query)]

    # @MemoryCached
    def getGeoEntityDictFromId(self, id):
        """ returns the location of the GazetteerEntry ID  
            @param id a list of geonames ids
            @return list of GeoEntities
        """
        if id:
            q = "SELECT * FROM gazetteerentity LEFT JOIN countryInfo USING(id) WHERE id IN (%s)" % ", ".join(
                map(str, id))
            res = self.db.query(q)
            if len(res) > 0:
                entities = [dict(list(self._getResultById(res, i).items()))
                            for i in id]
                self._addGeoUrl(entities)
                return entities

            print("WARNING: no entities found for ", ", ".join(map(str, id)))

        return []

    def _getResultById(self, l, id, idAttr="id"):
        """ returns the database column matching the given id 
            @param[in] l      the list of query results
            @param[in] id     the row id to return
            @param[in] idAttr attribute to consider for the id
        """
        return [e for e in l if e[idAttr] == id][0]

    def _addGeoUrl(self, entities):
        """ adds the geoUrl and level key to the given list of entities """
        for entity in entities:
            idUrl, nameUrl = self._getGeoUrl(entity['id'])
            entity['geoUrl'] = GEO_ENTITY_SEPARATOR.join(nameUrl)
            entity['idUrl'] = idUrl
            # hierarchy level of the entity (e.g. eu>at => 2)
            entity['level'] = len(nameUrl)

    def _getGeoUrl(self, id):
        """ returns the geoUrl for the given entity 
            @param[in] the geonames gazetteer id 
            @returns   two lists containing (geoIdPath, geoNamePath) 
        """
        geoNamePath = [self._getPreferredGeoName(id)]
        geoIdPath = [id]

        while id:
            parentLocationEntity = self._hasParent(id)
            if parentLocationEntity:
                parentLocationName = self._getPreferredGeoName(
                    parentLocationEntity)
                if parentLocationEntity in geoIdPath:
                    print("%s in %s" % (parentLocationName, geoNamePath))
                    break
                geoNamePath.append(parentLocationName)

            geoIdPath.append(parentLocationEntity)
            id = parentLocationEntity

        geoIdPath.reverse()
        geoNamePath.reverse()
        return (geoIdPath[1:], geoNamePath)

    # gets the preferred name for the location
    # @param id
    # @returns preferred name
    def _getPreferredGeoName(self, id):
        """ returns the preferred entry name for the given
            entity id
            @param[in] entity_id 
            @returns the geo entity's name
        """

        query = '''SELECT name FROM vw_gazetteer_tng WHERE id=%d 
                      ORDER BY 
                        (lang='en') DESC, (lang IS NULL) DESC, (lang = '') DESC,
                        (short=TRUE and short IS NOT NULL) DESC,
                        preferred DESC
                      LIMIT 1''' % (id)
        result = self.db.query(query)
        if not result:
            raise GazetteerEntryNotFound(id, query)

        return Gazetteer.DEBUG and result[0]['name'] + "(%d)" % (id) or result[0]['name']

    # checks if the given ID has a parent
    # @param ID of the child
    # @return false or ID of the parent
    def _hasParent(self, child_id):
        query = self.QUERY_HAS_PARENT % child_id
        result = self.db.query(query)

        # todo: is it necessary, that this functions can process multiple parents?
        # multiple parents (!)
        if result.__len__() > 1:
            print('### result > 1 ###')
            print('    child_id:  %s' % child_id)
            print('    parent_id: %s ' % [e['parent_id'] for e in result])

        # todo: does it make sense to fetch infinite loops
        if result == []:
            return 0
        else:
            return result[0]['parent_id']
Example #5
0
 def __init__(self):
     """ open the database connection """
     dbParam = DATABASE_CONNECTION['wikipedia'].copy()
     dbParam['connect'] = False
     self.db = PostgresqlDb(**dbParam)
Example #6
0
#!/usr/bin/env python

import os.path
from eWRT.access.db import PostgresqlDb
from eWRT.util.cache import DiskCached
from geoTEF.input import CSVResultCollection, SigCSVResultCollection
from csv import reader, writer
from glob import glob
from operator import itemgetter
from gzip import GzipFile

DELTA_DIR = "./delta"
TOP_EVAL_IDS = 25
GEO_EVAL_DB = PostgresqlDb("geoEval", "xmdimrill.ai.wu.ac.at", "geoEval",
                           "x12o21c")


@DiskCached(".get_corpora-Cache")
def get_corpus(corpusUrl):
    """ retrieves the given corpus using the reader
        specified in the corpusUrl

        @param[in] corpusUrl e.g. CSVResultCollection:/home/..../reuters.csv
        @returns the corpus as dictionary
    """
    assert ":" in corpusUrl
    readerClass, path = corpusUrl.split(":", 1)

    reader = eval(readerClass)()
    if path.endswith(".gz"):
        reader.read(GzipFile(path))
Example #7
0
class Gazetteer(object):
    # sorting by population is a workaround for entries with multiple parents
    # (without the sorting loops occure)
    QUERY_HAS_PARENT = '''
        SELECT parent_id 
        FROM gazetteerentity ga 
        JOIN locatedin ON (ga.id = locatedin.child_id)
        JOIN gazetteerentity gb ON (gb.id = locatedin.parent_id)
        WHERE child_id = %d order by gb.population DESC LIMIT 1'''

    QUERY_CONTENT_ID = '''
        SELECT gazetteer_id FROM content_id_gazeteer_id WHERE content_id = %d '''

    QUERY_NAME = '''
            SELECT entity_id, ispreferredname, lang, gazetteerentry_id
            FROM gazetteerentry_ordered_names
            WHERE name LIKE '%s' '''

    DEBUG = False

    ## init - establishes the db-connections
    def __init__(self):
        """ implement me """
        self.db = PostgresqlDb(**DATABASE_CONNECTION['gazetteer'])
        #PostgresqlDb.DEBUG=True
        self.db.connect()
        self.db2 = PostgresqlDb(**DATABASE_CONNECTION['geo_mapping'])
        self.db2.connect()

    @MemoryCached
    def getGeoNameFromContentID(self, content_id):
        """ returns the location of the content ID
            @param content_id
            @return list of locaions, e.g. ['Europa', 'France', 'Centre']
        """

        query = self.QUERY_CONTENT_ID % content_id
        result = self.db2.query(query)

        if result == []:
            return 'ContentID not found!'
        else:
            gaz_id = result[0]['gazetteer_id']
            return Gazetteer.getGeoNameFromGeoId(self, gaz_id)

    ## returns the location of the GazetteerEntry ID
    # @param gazetteer-entry ID
    # @return list of locations, e.g. ['Europa', 'France', 'Centre']
    @MemoryCached
    def getGeoNameFromGeoId(self, gazetteer_id):
        result = self.__getLocationTree(gazetteer_id)
        result.reverse()

        if result == []:
            return 'GazetteerID not found!'
        else:
            return result

    @MemoryCached
    def getGeoNameFromString(self, name):
        """ returns the geoname for the given string
            @param string
            @return a list of tuples (population, location) 
        """
        res = set()
        query = '''SELECT entity_id, population FROM gazetteerentry JOIN hasname ON (gazetteerentry.id = hasname.entry_id) 
                  JOIN gazetteerentity ON (gazetteerentity.id=hasname.entity_id) WHERE name = '%%s' AND (population > %d or feature_code in ('ADM1', 'ADM2', 'ADM3')) ''' % (
            MIN_POPULATION)
        for result in self.db.query(query % name.replace("'", "''")):
            try:
                tmp = Gazetteer.getGeoNameFromGeoId(self, result['entity_id'])
                res.add((result['population'], tuple(tmp)))
            except GazetteerEntryNotFound:
                pass

        return list(res)

    ## recursive function to build the full location tree
    # @param id
    # @returns list of locations
    def __getLocationTree(self, id):
        geoPath = [self.__getPreferredGeoName(id)]
        geoIdPath = [id]

        while id:
            parentLocationEntity = self.__hasParent(id)
            if parentLocationEntity:
                parentLocationName = self.__getPreferredGeoName(
                    parentLocationEntity)
                if parentLocationEntity in geoIdPath:
                    print "%s in %s" % (parentLocationName, geoPath)
                    break
                geoPath.append(parentLocationName)

            geoIdPath.append(parentLocationEntity)
            id = parentLocationEntity

        return geoPath

    ## gets the preferred name for the location
    # @param id
    # @returns preferred name
    def __getPreferredGeoName(self, id):
        """ returns the preferred entry name for the given
            entity id
            @param[in] entity_id 
            @returns the geo entity's name
        """

        query = '''SELECT name FROM vw_gazetteer_tng WHERE id=%d 
                      ORDER BY 
                            (lang='en' and lang is not null) DESC,
                            (short=TRUE and short IS NOT NULL) DESC,
                            preferred DESC
                      LIMIT 1''' % (id)
        result = self.db.query(query)
        if not result:
            raise GazetteerEntryNotFound(id, query)

        return Gazetteer.DEBUG and result[0]['name'] + "(%d)" % (id) or result[
            0]['name']

    ## checks if the given ID has a parent
    # @param ID of the child
    # @return false or ID of the parent
    def __hasParent(self, child_id):
        query = self.QUERY_HAS_PARENT % child_id
        result = self.db.query(query)

        # todo: is it necessary, that this functions can process multiple parents?
        # multiple parents (!)
        if result.__len__() > 1:
            print '### result > 1 ###'
            print '    child_id:  %s' % child_id
            print '    parent_id: %s ' % [e['parent_id'] for e in result]

        # todo: does it make sense to fetch infinite loops
        if result == []:
            return 0
        else:
            return result[0]['parent_id']

    @MemoryCached
    def __getNameGeoId(self, name):
        """ returns the possible geoids for the given name 
            @param[in] name
            @returns a list of geoids
        """
        query = "SELECT DISTINCT entity_id FROM vw_entry_id_has_name WHERE name='%s'" % (
            name.replace("'", "''"))
        return [r['entity_id'] for r in self.db.query(query)]

    def getGeoIdFromGeoUrl(self, geoUrl):
        """ returns the geoId forv the given geoUrl 
            @param[in] geoUrl String or List containing the geoUrl
            @returns the geoId
        """
        if isinstance(geoUrl, str):
            geoUrl = geoUrl.split("/")

        join = []
        where = []
        for nr, name in enumerate(geoUrl):
            join.append(
                "JOIN locatedin L%d ON (A%d.id = L%d.parent_id) JOIN gazetteerentity A%d ON (A%d.id = L%d.child_id)"
                % (nr, nr, nr, nr + 1, nr + 1, nr))
            where.append(
                "A%d.id IN (%s)" %
                (nr, ", ".join(map(str, self.__getNameGeoId(self, name)))))

        query = "SELECT A%d.id AS id FROM gazetteerentity A0 %s WHERE %s;" % (
            nr, " ".join(join[:-1]), " AND ".join(where))
        return [int(r['id']) for r in self.db.query(query)]

    def getGeoDict(self, geoId):
        """ returns a dictinary with all information about the given geoId """
        print geoId
        query = "SELECT * FROM gazetteerentity WHERE id = %s" % geoId
        res = self.db.query(query)
        if len(res) > 0:
            return dict(res[0])
        else:
            return {}
#!/usr/bin/env python
""" config - contains all important file locations """

__Revision__ = "$Id$"

import os.path
from warnings import warn

# geotagger database connection
try:
    from eWRT.access.db import PostgresqlDb
    DBNAME = ""
    DBHOST = ""
    DBUSER = ""
    DBPASS = ""
    GEO_DB_HANDLE = PostgresqlDb(DBNAME, DBHOST, DBUSER, DBPASS)
except ImportError:
    warn("Warning: Cannot import postgresql modules")

CACHE_DIR_LOCATION_REF = os.path.join(os.path.dirname(__file__), "cache",
                                      "LocationReference")
REUTERS_GOLDSTD_REF = os.path.join(os.path.dirname(__file__), "data",
                                   "goldstd_reuters.csv")

# $Id$
Example #9
0
class Gazetteer(object):
    # sorting by population is a workaround for entries with multiple parents
    # (without the sorting loops occure)
    QUERY_HAS_PARENT = '''
        SELECT parent_id 
        FROM gazetteerentity ga 
        JOIN locatedin ON (ga.id = locatedin.child_id)
        JOIN gazetteerentity gb ON (gb.id = locatedin.parent_id)
        WHERE child_id = %d order by gb.population DESC LIMIT 1'''

    QUERY_CONTENT_ID = '''
        SELECT gazetteer_id FROM content_id_gazeteer_id WHERE content_id = %d '''

    QUERY_NAME = '''
            SELECT entity_id, ispreferredname, lang, gazetteerentry_id
            FROM gazetteerentry_ordered_names
            WHERE name LIKE '%s' '''

    DEBUG = False

    ## init - establishes the db-connections
    def __init__(self):
        """ implement me """
        self.db = PostgresqlDb( **DATABASE_CONNECTION['gazetteer'] )
        #PostgresqlDb.DEBUG=True
        self.db.connect()
        self.db2 = PostgresqlDb( **DATABASE_CONNECTION['geo_mapping'] )
        self.db2.connect()

    @MemoryCached
    def getGeoNameFromContentID(self, content_id):
        """ returns the location of the content ID
            @param content_id
            @return list of locaions, e.g. ['Europa', 'France', 'Centre']
        """

        query = self.QUERY_CONTENT_ID % content_id 
        result = self.db2.query(query)

        if result == []:
            return 'ContentID not found!'
        else:
            gaz_id = result[0]['gazetteer_id']
            return Gazetteer.getGeoNameFromGeoId(self, gaz_id)

    ## returns the location of the GazetteerEntry ID  
    # @param gazetteer-entry ID  
    # @return list of locations, e.g. ['Europa', 'France', 'Centre']
    @MemoryCached
    def getGeoNameFromGeoId(self, gazetteer_id):
        result = self.__getLocationTree(gazetteer_id)
        result.reverse()

        if result == []:
            return 'GazetteerID not found!'
        else:
            return result

    @MemoryCached
    def getGeoNameFromString(self, name):
        """ returns the geoname for the given string
            @param string
            @return a list of tuples (population, location) 
        """
        res = set()
        query = '''SELECT entity_id, population FROM gazetteerentry JOIN hasname ON (gazetteerentry.id = hasname.entry_id) 
                  JOIN gazetteerentity ON (gazetteerentity.id=hasname.entity_id) WHERE name = '%%s' AND (population > %d or feature_code in ('ADM1', 'ADM2', 'ADM3')) ''' % ( MIN_POPULATION)
        for result in self.db.query(query % name.replace("'", "''")):
            try:
                tmp = Gazetteer.getGeoNameFromGeoId(self, result['entity_id'])
                res.add( (result['population'], tuple(tmp)) )
            except GazetteerEntryNotFound:
                pass

        return list(res)


    ## recursive function to build the full location tree
    # @param id
    # @returns list of locations
    def __getLocationTree(self, id):
        geoPath = [ self.__getPreferredGeoName( id ) ]
        geoIdPath = [ id ]

        while id:
            parentLocationEntity = self.__hasParent(id)
            if parentLocationEntity:
                parentLocationName = self.__getPreferredGeoName( parentLocationEntity )
                if parentLocationEntity in geoIdPath:
                    print "%s in %s" % (parentLocationName, geoPath)
                    break
                geoPath.append( parentLocationName )

            geoIdPath.append( parentLocationEntity )
            id = parentLocationEntity

        return geoPath


    ## gets the preferred name for the location
    # @param id
    # @returns preferred name
    def __getPreferredGeoName(self, id):
        """ returns the preferred entry name for the given
            entity id
            @param[in] entity_id 
            @returns the geo entity's name
        """ 

        query = '''SELECT name FROM vw_gazetteer_tng WHERE id=%d 
                      ORDER BY 
                            (lang='en' and lang is not null) DESC,
                            (short=TRUE and short IS NOT NULL) DESC,
                            preferred DESC
                      LIMIT 1''' % (id)
        result = self.db.query(query)
        if not result:
            raise GazetteerEntryNotFound(id, query)

        return Gazetteer.DEBUG and result[0]['name']+"(%d)" % (id) or result[0]['name']

    ## checks if the given ID has a parent
    # @param ID of the child
    # @return false or ID of the parent 
    def __hasParent(self, child_id):
        query = self.QUERY_HAS_PARENT % child_id
        result = self.db.query(query)
        
        # todo: is it necessary, that this functions can process multiple parents?
        # multiple parents (!)
        if result.__len__() > 1:
            print '### result > 1 ###'
            print '    child_id:  %s' % child_id
            print '    parent_id: %s ' % [ e['parent_id'] for e in result ]


        # todo: does it make sense to fetch infinite loops
        if result == []:
            return 0 
        else:
            return result[0]['parent_id']

    @MemoryCached
    def __getNameGeoId(self, name):
        """ returns the possible geoids for the given name 
            @param[in] name
            @returns a list of geoids
        """
        query = "SELECT DISTINCT entity_id FROM vw_entry_id_has_name WHERE name='%s'" % ( name.replace("'", "''") )
        return [ r['entity_id'] for r in self.db.query( query ) ]


    def getGeoIdFromGeoUrl(self, geoUrl):
        """ returns the geoId forv the given geoUrl 
            @param[in] geoUrl String or List containing the geoUrl
            @returns the geoId
        """
        if isinstance(geoUrl, str):
            geoUrl = geoUrl.split("/")

        join  = []
        where = []
        for nr, name in enumerate( geoUrl ):
            join.append("JOIN locatedin L%d ON (A%d.id = L%d.parent_id) JOIN gazetteerentity A%d ON (A%d.id = L%d.child_id)" % (nr, nr, nr, nr+1, nr+1, nr) )
            where.append( "A%d.id IN (%s)" % (nr, ", ".join( map(str, self.__getNameGeoId(self, name))) ))

        query = "SELECT A%d.id AS id FROM gazetteerentity A0 %s WHERE %s;" % ( nr, " ".join(join[:-1]), " AND ".join(where) )
        return [ int(r['id']) for r in self.db.query( query ) ]

    def getGeoDict(self, geoId):
        """ returns a dictinary with all information about the given geoId """
        print geoId
        query = "SELECT * FROM gazetteerentity WHERE id = %s" % geoId
        res = self.db.query( query )
        if len(res)>0:
            return dict(res[0])
        else:
            return {}
Example #10
0
 def __init__(self):
     """ initializes the gazetteer object and the database connections  """
     self.db = PostgresqlDb( **DATABASE_CONNECTION['gazetteer'] )
     self.db.connect()
     self.db2 = PostgresqlDb( **DATABASE_CONNECTION['geo_mapping'] )
     self.db2.connect()
Example #11
0
class Gazetteer(object):
    # sorting by population is a workaround for entries with multiple parents
    # (without the sorting loops occure)
    QUERY_HAS_PARENT = '''
        SELECT parent_id 
        FROM gazetteerentity ga 
        JOIN locatedin ON (ga.id = locatedin.child_id)
        JOIN gazetteerentity gb ON (gb.id = locatedin.parent_id)
        WHERE child_id = %d order by gb.population DESC LIMIT 1'''

    DEBUG = False

    def __init__(self):
        """ initializes the gazetteer object and the database connections  """
        self.db = PostgresqlDb( **DATABASE_CONNECTION['gazetteer'] )
        self.db.connect()
        self.db2 = PostgresqlDb( **DATABASE_CONNECTION['geo_mapping'] )
        self.db2.connect()

    def getGeoEntityDict(self, name=None, id=None, geoUrl=None):
        """ returns a list of GeoEntities matching the given information
            @param[in] name   of the Entity
            @param[in] id     the GeoNames id 
            @param[in] geoUrl the entity's dictionary
        """
        geoId = []
        if name:
            geoId.extend( self.getIdFromName(self, name) )
        if id:
            geoId.append( id )
        if geoUrl:
            geoId.extend( self.getIdFromGeoUrl( geoUrl ) )
        return self.getGeoEntityDictFromId( geoId )

    @MemoryCached
    def getIdFromName(self, name):
        """ returns the possible GeoNames ids for the given name 
            @param[in] name
            @returns a list of GeoNames ids
        """
        query = "SELECT id FROM vw_gazetteer WHERE name='%s' ORDER BY population DESC" % ( name.replace("'", "''") )
        res = [ r['id'] for r in self.db.query( query ) ]
        #query = "SELECT DISTINCT entity_id FROM vw_entry_id_has_name WHERE name='%s'" % ( name.replace("'", "''") )
        return [ r['id'] for r in self.db.query( query ) ]

    def getIdFromGeoUrl(self, geoUrl):
        """ returns the geoId for the given geoUrl 
            @param[in] geoUrl 
            @returns a list of geonames ids matching the geoUrl
        """
        geoUrl = geoUrl.split(GEO_ENTITY_SEPARATOR)

        join  = []
        where = []
        for nr, name in enumerate( geoUrl ):
            join.append("JOIN locatedin L%d ON (A%d.id = L%d.parent_id) JOIN gazetteerentity A%d ON (A%d.id = L%d.child_id)" % (nr, nr, nr, nr+1, nr+1, nr) )
            where.append( "A%d.id IN (%s)" % (nr, ", ".join( map(str, self.getIdFromName(self, name))) ))

        query = "SELECT A%d.id AS id FROM gazetteerentity A0 %s WHERE %s;" % ( nr, " ".join(join[:-1]), " AND ".join(where) )
        return [ int(r['id']) for r in self.db.query( query ) ]

    # @MemoryCached
    def getGeoEntityDictFromId(self, id):
        """ returns the location of the GazetteerEntry ID  
            @param id a list of geonames ids
            @return list of GeoEntities
        """
        if id:
            q = "SELECT * FROM gazetteerentity LEFT JOIN countryInfo USING(id) WHERE id IN (%s)" % ", ".join( map(str, id ))
            res = self.db.query( q ) 
            if len(res)>0:
                entities = [ dict(self._getResultById(res, i).items()) for i in id ]
                self._addGeoUrl( entities )
                return entities

            print "WARNING: no entities found for ", ", ".join( map(str,id) )

        return []

    def _getResultById(self, l, id, idAttr="id"):
        """ returns the database column matching the given id 
            @param[in] l      the list of query results
            @param[in] id     the row id to return
            @param[in] idAttr attribute to consider for the id
        """
        return [ e for e in l if e[idAttr] == id ][0]


    def _addGeoUrl( self, entities ):
        """ adds the geoUrl and level key to the given list of entities """
        for entity in entities:
            idUrl, nameUrl = self._getGeoUrl( entity['id'] )
            entity['geoUrl'] = GEO_ENTITY_SEPARATOR.join(nameUrl)
            entity['idUrl']  = idUrl
            entity['level']  = len(nameUrl)                         # hierarchy level of the entity (e.g. eu>at => 2)

    def _getGeoUrl(self, id):
        """ returns the geoUrl for the given entity 
            @param[in] the geonames gazetteer id 
            @returns   two lists containing (geoIdPath, geoNamePath) 
        """
        geoNamePath = [ self._getPreferredGeoName( id ) ]
        geoIdPath = [ id ]

        while id:
            parentLocationEntity = self._hasParent(id)
            if parentLocationEntity:
                parentLocationName = self._getPreferredGeoName( parentLocationEntity )
                if parentLocationEntity in geoIdPath:
                    print "%s in %s" % (parentLocationName, geoNamePath)
                    break
                geoNamePath.append( parentLocationName )

            geoIdPath.append( parentLocationEntity )
            id = parentLocationEntity

        geoIdPath.reverse()
        geoNamePath.reverse()
        return (geoIdPath[1:], geoNamePath)


    ## gets the preferred name for the location
    # @param id
    # @returns preferred name
    def _getPreferredGeoName(self, id):
        """ returns the preferred entry name for the given
            entity id
            @param[in] entity_id 
            @returns the geo entity's name
        """ 

        query = '''SELECT name FROM vw_gazetteer_tng WHERE id=%d 
                      ORDER BY 
                        (lang='en') DESC, (lang IS NULL) DESC, (lang = '') DESC,
                        (short=TRUE and short IS NOT NULL) DESC,
                        preferred DESC
                      LIMIT 1''' % (id)
        result = self.db.query(query)
        if not result:
            raise GazetteerEntryNotFound(id, query)

        return Gazetteer.DEBUG and result[0]['name']+"(%d)" % (id) or result[0]['name']


    ## checks if the given ID has a parent
    # @param ID of the child
    # @return false or ID of the parent 
    def _hasParent(self, child_id):
        query = self.QUERY_HAS_PARENT % child_id
        result = self.db.query(query)
        
        # todo: is it necessary, that this functions can process multiple parents?
        # multiple parents (!)
        if result.__len__() > 1:
            print '### result > 1 ###'
            print '    child_id:  %s' % child_id
            print '    parent_id: %s ' % [ e['parent_id'] for e in result ]


        # todo: does it make sense to fetch infinite loops
        if result == []:
            return 0 
        else:
            return result[0]['parent_id']