def best_guess(self, city=None, admin1=None, country=None,
               res_level=RESOLUTION_LEVEL.city):
    """Make a best guess for a given (city, admin1, country) tuple.

    :param city: city name
    :param admin1: admin1, state or province name
    :param country: country name (a known country alias is accepted)
    :param res_level: resolution level, passed through to the city resolver
    :returns: ([canonical LT], [alias LT])
              city/admin/country level geocoded tuple with lat-lon
              LT = (city, country, admin1, admin2, admin3,
                    pop, lat, lon, id, pad)
    NOTE: lat-lon corresponds to that of city/capital of admin1 or country
    """
    # Normalise only the values that were actually supplied; falsy inputs
    # (None, "") are kept as they are.
    ci = nstr(city) if city else city
    a1 = nstr(admin1) if admin1 else admin1
    co = nstr(country) if country else country

    # Map a country alias onto its canonical (normalised) name.
    if co and co in self.co_aliases:
        co = nstr(self.co_aliases[co])

    # ref: hack#1 in features list
    # With only admin1 + country given, use the city that shares the
    # admin1's name when one is known.
    if a1 and co and not ci and (co, a1) in self.same_ci_a1_name:
        ci = nstr(self.same_ci_a1_name[(co, a1)])

    if ci:
        return self.best_guess_city(ci, co, a1, res_level)
    if co and a1:
        return self.best_guess_admin1(a1, co)
    if co and co in self.co_names:
        return self.best_guess_country(co)
    return ([], [])
def best_guess_city(self, city, country=None, admin1=None,
                    res_level=RESOLUTION_LEVEL.city):
    '''Resolve city, consulting both canonical and alias city names.

    :param city: city name
    :param country (optional): country name
    :param admin1 (optional): admin1 name
    :param res_level (optional): resolution level; when it is greater than
           RESOLUTION_LEVEL.city and no city record is found, the lookup
           falls back to admin1 level and then to country level
    :returns: ([canonical LT], [alias LT])
              city/admin/country level geocoded tuple with lat-lon
              LT = (city, country, admin1, admin2, admin3,
                    pop, lat, lon, id, pad)
    '''
    ci = nstr(city.strip())
    co = nstr(country.strip()) if country else None
    a1 = nstr(admin1.strip()) if admin1 else None

    canonical_name_bgs, canonical_a_match = self._best_guess_city(ci, co, a1)
    # Records are hashable tuples, so a shallow copy is sufficient.
    all_bgs = set(canonical_name_bgs)
    alias_name_bgs, loc = set(), set()
    alias_a_match = False

    k = (ci, co)
    if k in self.ci_aliases:
        aliases_by_admin = self.ci_aliases[k]
        if a1 and a1 in aliases_by_admin:
            alias_a_match = True
            for o_ci in aliases_by_admin[a1]:
                loc.add(((nstr(o_ci), co), a1))
        else:
            if self.debug and a1:
                self.log.info("Alias key (%s) exists but not admin1 %s" %
                              (list(k), a1))
            for adm1, o_cities in aliases_by_admin.items():
                for o_ci in o_cities:
                    loc.add(((nstr(o_ci), co), adm1))

    # Use dedicated loop names so the normalised ``ci``/``co`` above are not
    # rebound; ``co`` is still needed by the fallback further down.
    for (alias_ci, alias_co), alias_a1 in loc:
        alias_bgs, _ = self._best_guess_city(alias_ci, alias_co, alias_a1)
        for bg in alias_bgs:
            if bg not in all_bgs:
                all_bgs.add(bg)
                alias_name_bgs.add(bg)

    # sort by specific priority (population, largest first)
    # TODO sort by tweet volume
    # Population may be None; rank those records last without comparing
    # None against int, which Python 3 rejects.
    def by_population(bg):
        return (bg[5] is not None, bg[5] or 0)

    alias_name_bgs = sorted(alias_name_bgs, key=by_population, reverse=True)
    canonical_name_bgs = sorted(canonical_name_bgs, key=by_population,
                                reverse=True)

    # if no results returned try resolving on admin1 level first
    # if that fails do country level
    if (not canonical_name_bgs and not alias_name_bgs and
            res_level > RESOLUTION_LEVEL.city):
        if co and a1:
            return self.best_guess_admin1(a1, co)
        elif co:
            return self.best_guess_country(co)

    # When admin1 matched only through an alias, report the alias records
    # as the primary result.
    if not canonical_a_match and alias_a_match:
        return (list(alias_name_bgs), [])
    return (list(canonical_name_bgs), list(alias_name_bgs))
def match_for_code(self, loc_str, city=None, admin1=None, country=None):
    """Extract admin1 and country names from country/admin1 codes.

    The codes are found with regular expressions built from the known
    country and admin1 codes, which are then mapped to full names. The
    input is typically the location string of a twitter user's profile.

    :param loc_str: location string, but could be any string containing
                    geocodable information
    :param city: name of city (returned untouched)
    :param admin1: name of admin1; looked up only when not given
    :param country: name of country; looked up only when not given
    :returns: locstr, city, admin1, country
    NOTE: loc string returned is substring after removal of matched
          loc entities
    """
    if not loc_str:
        return None, city, admin1, country

    text = loc_str  # nstr(loc_str)
    substitutions = (('D. F.', 'DF'), ('.', ''), ('-', ' '),
                     (', ', ' '), (',', ' '), ('#', ' '))
    for old, new in substitutions:
        text = text.replace(old, new)

    if not country:
        co_code = self.co_code_reg.search(text)
        if co_code and co_code in self.co_code:
            text = text.replace(co_code, " ", 1)
            country = self.co_code[co_code]

    if not admin1:
        # Without a known country the stop-word filtered pattern is used.
        code_reg = self.admin1_code_reg1 if country else self.admin1_code_reg2
        admin1_code = code_reg.search(text)

        if admin1_code and admin1_code in self.admin_code:
            # keep only (country, admin1) pairs known to the admin store
            candidates = [(nstr(c), a)
                          for c, a in self.admin_code[admin1_code]
                          if nstr(a) in self.admin_name and
                          nstr(c) in self.admin_name[nstr(a)]]
            if candidates:
                if country:
                    wanted = nstr(country)
                    candidates = [(c, a) for c, a in candidates
                                  if c == wanted]
                elif len(candidates) != 1:
                    # to be safe if country is not known
                    # dont make guesses
                    candidates = []
            if candidates:
                matched_co, matched_admin = candidates[0]
                admin1 = matched_admin
                if not country:
                    country = matched_co

        if admin1_code:
            text = text.replace(admin1_code, " ", 1)

    return text, city, admin1, country
def get_all_cities(self, country, admin1):
    '''Return all cities for a country & admin1 pair.

    :param country: country name
    :param admin1: admin1 name
    :returns: list of city names (first record stored for each city name
              under that country/admin1)
    '''
    co = nstr(country)
    a = nstr(admin1)
    if not (co and a):
        return []

    names = set()
    for by_country in self.bguess.values():
        if co in by_country and a in by_country[co]:
            first_index = by_country[co][a][0]
            names.add(self.data[first_index][1][0])
    return list(names)
def match_for_names(self, loc_str, city=None, admin1=None, country=None):
    """Extract city, admin1 and country names from a free-text string.

    The string is primarily the location mentioned in a twitter user's
    profile. Matching runs twice: once on the normalised string and once
    more after punctuation has been stripped from what is left.

    :param loc_str: location string, but could be any string containing
                    geocodable information
    :param city: name of city
    :param admin1: name of admin1
    :param country: name of country
    :returns: locstr, city, admin1, country
    NOTE: loc string returned is substring after removal of matched
          loc entities
    """
    if not loc_str:
        return None, city, admin1, country

    remaining = nstr(loc_str)
    remaining, city, admin1, country = self._match_for_names(
        remaining, city, admin1, country)

    for old, new in (('.', ''), ('-', ' '), (', ', ' '),
                     (',', ' '), ('#', ' ')):
        remaining = remaining.replace(old, new)

    remaining, city, admin1, country = self._match_for_names(
        remaining, city, admin1, country)
    return remaining, city, admin1, country
def best_guess_admin1(self, admin1, country):
    '''Resolve an (admin1, country) pair.

    :param admin1: admin1 name
    :param country: country name
    :returns: ([LT], []) admin1 level geocoded tuple with lat-lon
              LT = (city, country, admin1, admin2, admin3,
                    pop, lat, lon, id, pad)
    NOTE: lat-lon corresponds to avg. of lat, lon points from each record
          in world-gazetteer for given country and admin pair
    '''
    a1 = nstr(admin1)
    co = nstr(country)

    known = a1 in self.admin_name and co in self.admin_name[a1]
    if not known:
        # if not found return country level geocoding
        return self.best_guess_country(co)

    rid, lat, lon, code, admin_name, co_name = self.admin_name[a1][co]
    record = (None, co_name, admin_name, None, None, None, lat, lon, rid, 0)
    return ([record], [])
def best_guess_country(self, country):
    '''Resolve a country.

    :param country: country name
    :returns: ([LT], []) country level geocoded tuple with lat-lon, or
              ([], []) when the country is missing or unknown
              LT = (city, country, admin1, admin2, admin3,
                    pop, lat, lon, id, pad)
    NOTE: lat-lon corresponds to avg. of lat, lon points from each record
          in world-gazetteer for given country
    '''
    if not country:
        return ([], [])

    co = nstr(country)
    if not co or co not in self.co_names:
        return ([], [])

    rid, lat, lon, code, co_name = self.co_names[co]
    record = (None, co_name, None, None, None, None, lat, lon, rid, 0)
    return ([record], [])
def normalize_places(places):
    """Extract city, admin1 and country from the places json object of a tweet.

    :param places: dict-like places object (may be None or empty)
    :returns: (city, admin1, country); each entry is None when it cannot
              be derived from ``places``
    """
    city = admin1 = country = None
    if not places:
        return city, admin1, country

    if 'place_type' in places:
        place_type = places['place_type']

        if place_type == 'admin' and 'name' in places:
            admin1 = nstr(places['name'].strip())

        elif place_type == 'city':
            if 'name' in places:
                city = nstr(places['name'].strip())
            if 'full_name' in places:
                # full_name is split on commas: first part is the city,
                # last part (when there is more than one) the admin1
                parts = places['full_name'].split(',')
                head = parts[0].strip()
                if city is None and head:
                    city = nstr(head)
                if len(parts) > 1:
                    tail = parts[-1].strip()
                    if tail:
                        admin1 = nstr(tail)

        elif place_type == 'poi' or place_type == 'neighborhood':
            if 'full_name' in places:
                parts = places['full_name'].split(',')
                if len(parts) > 1:
                    tail = parts[-1].strip()
                    if tail:
                        admin1 = nstr(tail)

    if 'country' in places:
        country = nstr(places['country'].strip())
        # fix for twitter's venezuela long name
        if "venezuela" in country:
            country = "venezuela"

    return city, admin1, country
def _best_guess_city(self, city, country=None, admin1=None):
    '''Resolve city against canonical city names only.

    :param city: city name
    :param country (optional): country name
    :param admin1 (optional): admin1 name
    :returns: ([canonical LT], canonical_admin_match)
              city/admin/country level geocoded tuple with lat-lon
              LT = (city, country, admin1, admin2, admin3,
                    pop, lat, lon, id, pad)
              canonical_admin_match : T/F, True when the given admin1 was
              found for the city in the given (or only possible) country
    '''
    ci = nstr(city.strip())
    co = nstr(country.strip()) if country else None
    a1 = nstr(admin1.strip()) if admin1 else None

    # best-guess'es
    # Only the canonical city-name store is searched here; alias names are
    # handled by the caller, which merges both result sets.
    canonical_name_bgs = set()
    canonical_a_match = False

    if ci not in self.bguess:
        return canonical_name_bgs, canonical_a_match

    by_country = self.bguess[ci]

    def first_record(by_admin, adm):
        # first stored record index for this admin1 -> its LT tuple
        return self.data[by_admin[adm][0]][1]

    def all_first_records(by_admin):
        # one LT tuple (the first stored) for every admin1 of the country
        return [self.data[indices[0]][1] for indices in by_admin.values()]

    if co and co in by_country:
        if a1:
            # TODO add log in else
            if a1 in by_country[co]:
                canonical_a_match = True
                canonical_name_bgs.add(first_record(by_country[co], a1))
        else:
            canonical_name_bgs.update(all_first_records(by_country[co]))
    elif len(by_country) == 1:
        # pick if only one possible ci-co pair is present
        # next(iter(...)) works on Python 2 and 3; dict.keys()[0] does not.
        co1 = next(iter(by_country))
        if a1:
            # if admin is provided
            # make sure its present in picked country
            if a1 in by_country[co1]:
                canonical_a_match = True
                canonical_name_bgs.add(first_record(by_country[co1], a1))
        elif len(by_country[co1]) == 1:
            canonical_name_bgs.update(all_first_records(by_country[co1]))
        else:
            # do country level geocoding
            bg_co = self.best_guess_country(co1)[0]
            if bg_co:
                canonical_name_bgs.add(bg_co[0])
            else:
                # NOTE(review): the logger may not be configured on every
                # instance - confirm against __init__; skip logging
                # rather than fail the lookup.
                log = getattr(self, "log", None)
                if log is not None:
                    log.info("Country not found CO:{0}".format(co1))
    elif a1:
        # if more than one country pair is present, admin1 must single
        # out exactly one of them
        possible_co = set(cand for cand in by_country
                          if a1 in by_country[cand])
        if len(possible_co) == 1:
            co1 = next(iter(possible_co))
            canonical_name_bgs.add(first_record(by_country[co1], a1))

    return canonical_name_bgs, canonical_a_match
def __init__(self, wg_data=WG_DATA, co_admin_data=CO_ADMIN_DATA,
             priority_policy=PRIORITY_POLICY, debug=False):
    """Load the gazetteer data and build all lookup structures.

    :param wg_data: city level (world-gazetteer) data source, opened via
                    get_wg_data
    :param co_admin_data: country/admin1 level data source, opened via
                          get_co_admin_data
    :param priority_policy: stored on the instance and hashed into the
                            version string
    :param debug: when True, logging is initialised and bad country rows
                  are reported

    Attributes built here:
      co_code, co_names, co_aliases, co_capital_cities -- country lookups
      admin_code, admin_name -- admin1 lookups
      ci_aliases -- (alias city, country) -> {admin1: set(city names)}
      data -- list of (index, LT) records, LT = (city, country, admin1,
              admin2, admin3, pop, lat, lon, id, pad)
      coordinates, kdtree -- (lat, lon) -> record index and a KDTree of
              the non-None coordinates
      same_ci_a1_name -- (country, admin1) -> city sharing the admin1 name
      co_reg, ci_reg, admin1_reg, co_code_reg, admin1_code_reg1,
      admin1_code_reg2 -- ManyRE matchers over names and codes
      bguess -- city -> country -> admin1 -> deque of record indices
    """
    self.priority_policy = priority_policy
    self.debug = debug
    # version string: class name, module version, and md5 digests of both
    # data files and of the priority policy
    self.__version__ = "{0}-{1}-{2}-{3}-{4}".format(
        self.__class__.__name__,
        __version__,
        hashlib.md5(get_wg_data(wg_data).read()).hexdigest(),
        hashlib.md5(get_co_admin_data(co_admin_data).read()).hexdigest(),
        hashlib.md5(" ".join(self.priority_policy)).hexdigest())

    if self.debug:
        try:
            logs.init()
        except IOError:  # , err:
            # fall back to a logfile named after the class
            logs.init(logfile=self.__class__.__name__.lower())

    # NOTE(review): assumed to run regardless of debug, since self.log is
    # used without a debug guard elsewhere - confirm the intended nesting.
    self.log = logs.getLogger("{0}-{1}".format(
        self.__class__.__name__, __version__.replace('.', '_')))

    # 1. load country and admin1 level geo data
    f = get_co_admin_data(co_admin_data)
    dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect, fieldnames=CO_ADMIN_FIELDS)

    # NOTE:
    # Known conflicts b/w codes of countries and other admins
    # co Colombia ('Colombia', 'C\xc3\xb3rdoba')
    # cl Chile ('Colombia', 'Caldas')
    # ar Argentina ('Colombia', 'Arauca')
    # sv El Salvador ('El Salvador', 'San Vicente')

    # prep lookup dictionaries
    # key__value
    # countries
    self.co_code = {}             # iso code -> country name
    self.co_names = {}            # norm name -> (id, lat, lon, code, name)
    self.co_aliases = {}          # norm alias -> country name
    self.co_capital_cities = {}   # norm capital -> (capital, country name)
    # admin1
    self.admin_code = {}          # code -> [(country, admin1), ...]
    self.admin_name = {}          # norm admin1 -> {norm country: record}

    # assumes countries appear first when reading data from
    # lac_co_admin TODO BAD!
    for r in reader:
        for k in r.keys():
            r[k] = r[k].strip()
        lat = float_or_none(r['latitude'])
        lon = float_or_none(r['longitude'])
        code = object_or_none(r['iso_3166_code'])
        rid = int_or_none(r["id"])
        if r['type'] == 'country':  # country
            if code:
                self.co_code[code] = r['name']
                self.co_names[nstr(r['name'])] = (rid, lat, lon,
                                                  code, r['name'])
                self.co_capital_cities[nstr(r['capital_city'])] =\
                    (r['capital_city'], r['name'])
                aliases = r['alt_names'].split(',')
                self.co_aliases.update({nstr(alias.strip()): r['name']
                                        for alias in aliases})
            else:
                # country rows without a code are skipped
                if self.debug:
                    self.log.error("Bad data country {0} Code {1}".format(
                        r['name'], code))
        elif r['type'] == 'admin':  # admin
            # full_name is expected to be "<admin1>, <country>"
            admin, co = r['full_name'].split(',')
            admin, co = admin.strip(), co.strip()
            if code:
                if code not in self.admin_code:
                    self.admin_code[code] = []
                self.admin_code[code].append((co, admin))
            co1, a = nstr(co), nstr(admin)
            if a not in self.admin_name:
                self.admin_name[a] = {}
            # first record seen for an (admin1, country) pair wins
            if co1 not in self.admin_name[a]:
                self.admin_name[a][co1] = (rid, lat, lon,
                                           code, admin, co)
    f.close()

    # 2. load (world-gazeteer) city level geo data
    f = get_wg_data(wg_data)
    dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect, fieldnames=WG_FIELDS)
    self.ci_aliases = {}
    # main data store for geocoding
    self.data = []
    counter = 0
    ci_set = set()
    for r in reader:
        for k in r.keys():
            r[k] = r[k].strip()
        # get alias names for cities
        ci_names = [a.strip() for a in r['alt_names'].split(',')
                    if len(a.strip()) > 0]
        ci_names.extend([a.strip() for a in r['orig_names'].split(',')
                         if len(a.strip()) > 0])
        for ci in ci_names:
            k = (nstr(ci), nstr(r['country']))
            a1 = nstr(r['admin1'])
            if k not in self.ci_aliases:
                self.ci_aliases[k] = {a1: set([r['name']])}
            elif a1 not in self.ci_aliases[k]:
                self.ci_aliases[k][a1] = set([r['name']])
            else:
                # Cases where different cities for same
                # admin-country pair have the same alias
                self.ci_aliases[k][a1].add(r['name'])
            # add ci name aliases into ci_set
            ci_set.add(nstr(ci))
        # store only cannonical cities names
        # latitude/longitude from the file are divided by 100 here
        self.data.append((counter, (r['name'], r['country'], r['admin1'],
                                    object_or_none(r['admin2']),
                                    object_or_none(r['admin3']),
                                    int_or_none(r['pop']),
                                    float_or_none(r['latitude']) / 100,
                                    float_or_none(r['longitude']) / 100,
                                    int(r['id']), int(r['padded']))))
        counter += 1

    self.coordinates = {}
    # cases where admin1 and city share the same name
    # extended feature/hack #1 to resolve city when
    # only country and admin1 are specified
    self.same_ci_a1_name = {}
    for i, (n, c, a1, a2, a3, p, lat, lon, i_d, pad) in self.data:
        nn, nc, na1 = nstr(n), nstr(c), nstr(a1)
        # later records overwrite earlier ones at the same (lat, lon)
        self.coordinates[(lat, lon)] = i
        if nn == na1 and pad == 0:
            self.same_ci_a1_name[(nc, na1)] = n
        # NOTE(review): assumed to apply to every record so canonical
        # city names reach ci_set - confirm the intended nesting.
        ci_set.add(nn)

    # store (lat, lon)
    self.kdtree = KDTree([[i, j] for i, j in self.coordinates.keys()
                          if i is not None and j is not None])

    # build regular expr dicts
    co_set = set(self.co_names.keys())
    # add country name aliases into co_set
    co_set.update(self.co_aliases.keys())
    self.co_reg = ManyRE(co_set)
    self.ci_reg = ManyRE(ci_set)
    # add admin1 name aliases into admin1_set
    admin1_set = set(self.admin_name.keys())
    # build regular expression stores for co-admin1-ci
    self.admin1_reg = ManyRE(admin1_set)

    # add stopwords to prevent any 2-letter word in common usage
    # to be mis-interpretted as country or admin code
    two_letter_stop_words = set(
        ['BE', 'WE', '\xc3\xa0', 'YO', 'DO', 'YA', 'DE', 'DA', 'HA', 'BY',
         'HE', 'AL', 'NI', 'LE', 'NO', 'LO', 'TU', 'TO', 'TI', 'TE', 'EM',
         'EL', 'EN', 'IS', 'OS', 'AM', 'IT', 'AO', 'AN', 'AS', 'AT', 'IN',
         'EU', 'ES', 'IF', 'ME', 'ON', 'OF', 'LA', 'MI', 'UP', 'SU', 'UM',
         'UN', 'SO', 'NA', 'OU', 'MY', 'OR', 'SE', 'US'])

    self.co_code_reg = ManyRE([sw for sw in self.co_code.keys()
                               if sw not in two_letter_stop_words])
    # reg1 matches every admin code; reg2 leaves out the stop words
    self.admin1_code_reg1 = ManyRE(self.admin_code.keys())
    self.admin1_code_reg2 = ManyRE([sw for sw in self.admin_code.keys()
                                    if sw not in two_letter_stop_words])

    self.bguess = {}
    for i, (city, country, admin1, a2, a3, p, la, lo, i_d, pad)\
            in self.data:
        ci, co, a = nstr(city), nstr(country), nstr(admin1)
        # value is list of admin1's that correspond to ci-co key
        # ci-co makes dictionary flatter
        # choose not to use co-admin1-ci as key to add more flexibility
        # for lookups
        if ci in self.bguess:
            if co in self.bguess[ci]:
                if a in self.bguess[ci][co]:
                    # store original wg-records marked with pad = 0
                    # to head of the queue
                    if pad == 0:
                        self.bguess[ci][co][a].appendleft(i)
                    else:
                        self.bguess[ci][co][a].append(i)
                else:
                    self.bguess[ci][co][a] = deque([i])
            else:
                self.bguess[ci][co] = {a: deque([i])}
        else:
            self.bguess[ci] = {co: {a: deque([i])}}