def find_geoid(self, geoid: str, place: Loc) -> None:
    """
    Look up a geonames.org entry by its geoid and record the outcome on place.

    #Args:
        geoid: Geonames.org geoid to look up
        place: Loc instance. Its geoid, georow_list and result_type are overwritten

    #Returns:
        None. place.result_type is set to STRONG_MATCH when at least one row was
        found (process_results is then called to update place), otherwise NO_MATCH
    """
    place.geoid = geoid
    place.georow_list.clear()

    lookup_geoid = self.geo_build.geodb.s.lookup_geoid
    lookup_geoid(georow_list=place.georow_list, geoid=place.geoid, place=place)
    if not place.georow_list:
        # The plain lookup returned nothing: repeat it with admin=True
        lookup_geoid(georow_list=place.georow_list, geoid=place.geoid, place=place, admin=True)

    if not place.georow_list:
        place.result_type = GeoUtil.Result.NO_MATCH
        return

    place.result_type = GeoUtil.Result.STRONG_MATCH
    self.process_results(place=place, flags=ResultFlags(limited=False, filtered=False))
def process_results(self, place: Loc, flags) -> None:
    """
    Update the place record from the first entry in place.georow_list.

    When place.result_type is one of GeoUtil.successful_match and rows exist, the
    first row is copied into place through geodb.copy_georow_to_place.
    When rows exist but the result is neither successful nor NOT_SUPPORTED, the
    result is downgraded to PARTIAL_MATCH.

    #Args:
        place: Loc instance
        flags: Result flags. Accepted for interface compatibility; not read here

    #Returns:
        None. place instance fields are updated
    """
    unsupported = place.result_type == GeoUtil.Result.NOT_SUPPORTED
    if unsupported:
        # An unsupported country is always reported as a country-level place
        place.place_type = Loc.PlaceType.COUNTRY

    if place.georow_list:
        if place.result_type in GeoUtil.successful_match:
            best_row = place.georow_list[0]
            self.geo_build.geodb.copy_georow_to_place(row=best_row, place=place, fast=False)
        elif not unsupported:
            place.result_type = GeoUtil.Result.PARTIAL_MATCH

    place.set_place_type_text()
def is_country_valid(self, place: Loc) -> bool:
    """
    Check that place has a country and that the country is in the supported list.

    On failure place.result_type is set to NO_COUNTRY (empty country_iso) or
    NOT_SUPPORTED (country_iso missing from geo_build.supported_countries_dct).
    In the NOT_SUPPORTED case place.place_type is also set to COUNTRY.

    #Args:
        place: Loc instance whose country_iso is checked

    #Returns:
        True if country is valid
    """
    iso = place.country_iso

    if iso == '':
        place.result_type = GeoUtil.Result.NO_COUNTRY
        return False

    if iso in self.geo_build.supported_countries_dct:
        return True

    self.logger.debug(f'Country [{place.country_iso}] not supported')
    place.result_type = GeoUtil.Result.NOT_SUPPORTED
    place.place_type = Loc.PlaceType.COUNTRY
    return False
def lookup_place(self, place: Loc) -> float:
    """
    **Lookup a place in geoname.org db**

    Lookup is based on place.place_type as follows:
        Loc.PlaceType.COUNTRY: no search is run. If place.georow_list already has
            rows, place.country_name is normalised through self.get_country_name
        Otherwise: place.georow_list is cleared and refilled by self._search using
            the city, admin1/admin2 ids, country iso, feature and soundex of the city.
            For Loc.PlaceType.ADMIN1 the feature is forced to "ADM1" first

    place.result_type is set to Result.STRONG_MATCH on entry in every case.

    # Args:
        place: Loc instance. Call Loc.parse_place() before calling lookup_place()

    # Returns:
        Best score found: MatchScore.Score.VERY_GOOD or VERY_POOR for a country,
        otherwise whatever self._search returns (presumably a numeric match
        score - confirm against _search).
        place.georow_list contains the list of matching entries
    """
    place.result_type = Result.STRONG_MATCH
    best_score = MatchScore.Score.VERY_POOR

    if place.place_type == Loc.PlaceType.COUNTRY:
        # Country: rely on rows that are already attached to place
        if place.georow_list:
            place.country_name = self.get_country_name(place.country_name)
            best_score = MatchScore.Score.VERY_GOOD
    else:
        # General search
        if place.place_type == Loc.PlaceType.ADMIN1:
            place.feature = "ADM1"

        place.georow_list.clear()
        best_score = self._search(
            place=place,
            georow_list=place.georow_list,
            name=place.city,
            admin1_id=place.admin1_id,
            admin2_id=place.admin2_id,
            iso=place.country_iso,
            feature=place.feature,
            sdx=get_soundex(place.city),
        )

    self.logger.debug(f'**LOOKUP PLACE score={best_score}')
    return best_score
def find_matches(self, location: str, place: Loc):
    """
    Find a location in the geoname database.

    On successful match, place.georow_list will contain a list of georows that
    matched the name. process_results is called before returning so that place
    is updated from the best row.

    #Args:
        location: comma separated name of location to find, e.g. 'Los Angeles, California, USA'
        place: Loc structure. It is parsed from location and updated in place

    #Returns:
        GeoUtil.Result code (the final place.result_type)
    """
    place.parse_place(place_name=location, geo_db=self.geo_build.geodb)

    # Called for its side effect: it sets place.result_type when the country is
    # missing or unsupported. The boolean result is not needed here.
    self.is_country_valid(place)
    if place.result_type == GeoUtil.Result.NOT_SUPPORTED:
        place.georow_list.clear()
        # Return the result code, consistent with every other exit of this method
        return place.result_type

    # Create full entry text
    place.update_names(self.geo_build.output_replace_dct)

    flags = ResultFlags(limited=False, filtered=False)
    result_list = []  # Each search pass appends its rows here

    # Save a shallow copy of place so we can restore fields
    self.save_place = copy.copy(place)

    # After parsing, last token is either country or underscore.
    # Second to last is either Admin1 or underscore
    # If >2 tokens: token[0] is placed in City and in Prefix
    # If >3 tokens: token[1] is placed in Admin2 and appended to Prefix

    # 1) Try lookup based on standard parsing: lookup city, county, state/province, or country as parsed
    self.logger.debug(
        f' 1) Standard, based on parsing. pref [{place.prefix}] city [{place.city}]'
        f' sdx={GeoSearch.get_soundex(place.city)} '
        f'feat={place.feature} typ=[{place.place_type}]')

    # NOTE(review): the original condition compared against ADMIN1 twice. The log
    # messages mention adm2, so ADMIN2 may have been intended - confirm. Behaviour
    # is kept as it was: only COUNTRY and ADMIN1 skip the searches.
    if place.place_type in (Loc.PlaceType.COUNTRY, Loc.PlaceType.ADMIN1):
        self.logger.debug('not country, adm1, adm2')
        return place.result_type

    self.logger.debug('find std place - not ADM* ')
    best_score = self.geo_build.geodb.s.lookup_place(place=place)
    self.logger.debug(f'std: best={best_score}')
    if place.georow_list:
        result_list.extend(place.georow_list)

    if best_score >= MatchScore.Score.POOR_CUTOFF:
        # No good matches found. Try a deep search on soundex of combinations of terms
        self.logger.debug('--- DEEP SEARCH city ---')
        self.geo_build.geodb.s.deep_lookup(place=place)
        if place.georow_list:
            result_list.extend(place.georow_list)

    # Restore fields
    self._restore_fields(place, self.save_place)

    # 2) Try second token (Admin2) as a city
    if place.admin2_name != '':
        self.logger.debug('try 2nd token as city')
        place.georow_list.clear()
        best_score = self._find_type_as_city(place, Loc.PlaceType.ADMIN2)
        self.logger.debug(f'2nd token best={best_score}')
        if place.georow_list:
            result_list.extend(place.georow_list)

        # See if we found any good scoring matches
        if best_score >= MatchScore.Score.POOR_CUTOFF:
            # No good matches found. Try a deep search on soundex of combinations of terms
            self.logger.debug('--- DEEP SEARCH city ---')
            self.geo_build.geodb.s.deep_lookup(place=place)
            if place.georow_list:
                result_list.extend(place.georow_list)

    self._restore_fields(place, self.save_place)

    # Move result_list into place georow list
    place.georow_list.clear()
    place.georow_list.extend(result_list)

    if len(place.georow_list) > 0:
        # Called for its effect on the rows; the returned score is not used here
        self.geo_build.geodb._assign_scores(place.georow_list, place, '', fast=False, quiet=True)
        self.process_results(place=place, flags=flags)
        flags = self.filter_results(place)

    if len(place.georow_list) == 0:
        # NO MATCH
        if place.result_type not in (GeoUtil.Result.NO_COUNTRY, GeoUtil.Result.NOT_SUPPORTED):
            place.result_type = GeoUtil.Result.NO_MATCH
            self.logger.debug('Not found.')
        else:
            self.logger.debug('Found country')
    elif len(place.georow_list) > 1:
        self.logger.debug(f'Success! {len(place.georow_list)} matches')
        place.result_type = GeoUtil.Result.MULTIPLE_MATCHES

    # Process the results
    self.process_results(place=place, flags=flags)
    return place.result_type
def filter_results(self, place: Loc):
    """
    Sort place.georow_list by match score and keep only rows close to the best score.

    Steps:
      1) Duplicates are removed by self.remove_duplicates(place)
      2) Rows are sorted by (score, admin1); the first row supplies the best (minimum) score
      3) place.georow_list is rebuilt with the rows whose score is within
         gap_threshold of the best score; the rest are logged and dropped
      4) If exactly one row remains, the best score is at most
         MatchScore.Score.VERY_GOOD and the result is not NOT_SUPPORTED, then
         place.result_type becomes STRONG_MATCH. Otherwise a line is written to
         self.miss_diag_file when that file is set

    If the best row is too short to hold a score, the list is left as it is
    after duplicate removal.

    #Args:
        place: Loc instance whose georow_list is filtered in place

    #Returns:
        ResultFlags(limited=limited_flag, filtered=date_filtered). limited is True
        when more than 100 rows were passed in. filtered is always False here
        because no date filtering is performed in this method
    """
    date_filtered = False  # No rows are dropped because of event date in this method
    limited_flag = len(place.georow_list) > 100
    flags = ResultFlags(limited=limited_flag, filtered=date_filtered)

    if len(place.georow_list) == 0:
        self.logger.debug('EMPTY')
        return flags

    # Remove duplicate locations in list
    self.remove_duplicates(place)
    if len(place.georow_list) == 0:
        self.logger.error('georow_list = 0')
        return flags

    # Sort places in match_score order. The list is known to be non-empty here,
    # so the sorted copy always has a first row.
    new_list = sorted(place.georow_list, key=itemgetter(GeoUtil.Entry.SCORE, GeoUtil.Entry.ADM1))
    best_row = new_list[0]
    if len(best_row) < GeoUtil.Entry.SCORE + 1:
        self.logger.debug(f'len = {len(best_row)}')
        self.logger.debug(f'[{best_row}]')
        return flags

    min_score = best_row[GeoUtil.Entry.SCORE]

    # Acceptance window above the best score. It depends only on min_score, so
    # it is computed once rather than on every row.
    base = MatchScore.Score.VERY_GOOD + (MatchScore.Score.GOOD / 3)
    gap_threshold = base + abs(min_score) * .6
    cutoff = min_score + gap_threshold

    # Go through sorted list and only add items to georow_list that are close to the best score
    place.georow_list.clear()
    for geo_row in new_list:
        score = geo_row[GeoUtil.Entry.SCORE]
        if score > cutoff:
            self.logger.debug(
                f'SKIP Score={score:.1f} Min={min_score:.1f} Gap={gap_threshold:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}]'
                f' {geo_row[GeoUtil.Entry.NAME]},'
                f' {geo_row[GeoUtil.Entry.ADM2]},'
                f' {geo_row[GeoUtil.Entry.ADM1]} ')
        else:
            place.georow_list.append(geo_row)
            self.logger.debug(
                f'Score {score:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}] {geo_row[GeoUtil.Entry.NAME]}, '
                f'AD2={geo_row[GeoUtil.Entry.ADM2]},'
                f' AD1={geo_row[GeoUtil.Entry.ADM1]} {geo_row[GeoUtil.Entry.ISO]}')

    single_strong = (min_score <= MatchScore.Score.VERY_GOOD
                     and len(place.georow_list) == 1
                     and place.result_type != GeoUtil.Result.NOT_SUPPORTED)
    if single_strong:
        place.result_type = GeoUtil.Result.STRONG_MATCH
    elif self.miss_diag_file:
        # Log item that we couldn't match. score is the score of the last
        # (worst) row in the sorted list.
        self.miss_diag_file.write(
            f'Lookup {place.original_entry} thresh={gap_threshold} gap={score - min_score}\n\n')

    return flags