def find_geoid(self, geoid: str, place: Loc) -> None:
    """
    Look up a location by its geoid and load the result into place.

    The geoid is looked up once; if that leaves place.georow_list empty the
    lookup is repeated with admin=True.

    #Args:
        geoid: Geonames.org geoid
        place: Loc instance. Its geoid, georow_list and result_type are updated
    #Returns:
        None. place.result_type is STRONG_MATCH when a row was found
        (and process_results() has been applied), otherwise NO_MATCH
    """
    search = self.geo_build.geodb.s
    place.geoid = geoid
    place.georow_list.clear()

    search.lookup_geoid(georow_list=place.georow_list, geoid=place.geoid, place=place)
    if not place.georow_list:
        # Nothing found by the plain lookup - retry with the admin flag set
        search.lookup_geoid(georow_list=place.georow_list, geoid=place.geoid, place=place, admin=True)

    if not place.georow_list:
        place.result_type = GeoUtil.Result.NO_MATCH
        return

    place.result_type = GeoUtil.Result.STRONG_MATCH
    self.process_results(place=place, flags=ResultFlags(limited=False, filtered=False))
def process_results(self, place: Loc, flags) -> None:
    """
    Update fields in place using the first entry in place.georow_list.

    - NOT_SUPPORTED results are typed as a country.
    - A successful result with at least one row is copied into place.
    - Any other result with rows (except NOT_SUPPORTED) is downgraded to
      PARTIAL_MATCH.
    The place type text is refreshed in every case.

    #Args:
        place: Loc instance
        flags: Flags tuple as returned by filter_results (not read here)
    #Returns:
        None. place instance fields are updated
    """
    outcome = place.result_type
    unsupported = outcome == GeoUtil.Result.NOT_SUPPORTED
    if unsupported:
        place.place_type = Loc.PlaceType.COUNTRY

    has_rows = len(place.georow_list) > 0
    if has_rows:
        if outcome in GeoUtil.successful_match:
            self.geo_build.geodb.copy_georow_to_place(row=place.georow_list[0], place=place, fast=False)
        elif not unsupported:
            place.result_type = GeoUtil.Result.PARTIAL_MATCH

    place.set_place_type_text()
def fast_score(self, target_place: Loc, result_place: Loc) -> float:
    """
    Get a rough, fast score for similarity between target and result.

    The five part titles of both places are compared with
    fuzz.token_sort_ratio and the ratio is subtracted from 100, so
    0 is best and 100 is worst.

    #Args:
        target_place: Loc with the text being searched for
        result_place: Loc with the candidate result
    #Returns:
        100 minus the token sort ratio of the two titles
    """
    titles = (result_place.get_five_part_title(), target_place.get_five_part_title())
    return 100 - fuzz.token_sort_ratio(*titles)
def _find_type_as_city(self, place: Loc, typ) -> int:
    """
    Do a lookup using the field specified by typ as a city name.

    E.g. if typ is PlaceType.ADMIN2 then place.admin2_name is moved into
    place.city (and admin2_name is blanked) before the city lookup.
    PlaceType.ADVANCED_SEARCH skips the city handling and goes straight to
    geodb.lookup_place().

    #Args:
        place: Loc instance
        typ: Loc.PlaceType - Specifies which field to use as target for lookup
    #Returns:
        Best score reported by the lookup, or 999 when no lookup was done
        (source field empty or unknown typ). place.georow_list is updated
        with matches
    """
    if typ == Loc.PlaceType.ADVANCED_SEARCH:
        return self.geo_build.geodb.lookup_place(place=place)

    label = ''  # Stays empty when there is nothing to look up
    if typ == Loc.PlaceType.CITY:
        # City is used as-is
        label = 'City'
    elif typ == Loc.PlaceType.ADMIN2:
        if place.admin2_name != '':
            place.city = place.admin2_name
            place.admin2_name = ''
            label = 'Admin2'
    elif typ == Loc.PlaceType.PREFIX:
        if place.prefix != '':
            place.city = place.prefix
            label = 'Prefix'
    else:
        self.logger.warning(f'Unknown TYPE {typ}')

    if label == '':
        return 999

    self.logger.debug(f'2) Try {label} as City. Target={place.city} pref [{place.prefix}] ')
    place.place_type = Loc.PlaceType.CITY

    search = self.geo_build.geodb.s
    best = search.lookup_place(place=place)
    self.logger.debug(f'best={best}')

    if best >= MatchScore.Score.POOR_CUTOFF:
        # Nothing scored well enough - fall back to the deep lookup
        self.logger.debug('--- DEEP SEARCH ADM2 ---')
        best = search.deep_lookup(place=place)
    return best
def run_test_score(idx) -> int:
    """
    Run scoring test case idx and return its match score.

    #Args:
        idx: index of the test case handed to TestScoring.prepare_test
    #Returns:
        The value returned by TestScoring.scoring.match_score for the case
    """
    target, result = Loc.Loc(), Loc.Loc()
    TestScoring.prepare_test(idx, target, result)

    score = TestScoring.scoring.match_score(target, result)

    # Normalized titles are only needed for the log line
    shown_in, shown_out = (MatchScore.full_normalized_title(item) for item in (target, result))
    TestScoring.logger.debug(f' {idx}) {score:.1f} In=[{shown_in}] Out=[{shown_out}]')
    return score
def full_normalized_title(self, place: Loc) -> str:
    """
    Create a full normalized five part title (includes prefix).

    #Args:
        place: Loc instance providing get_five_part_title()
    #Returns:
        The five part title after self.norm.normalize_for_scoring()
    """
    return self.norm.normalize_for_scoring(place.get_five_part_title())
def __init__(self, directory_name: str, display_progress, show_message: bool, exit_on_error: bool,
             languages_list_dct, feature_code_list_dct, supported_countries_dct, volume=''):
    """
    Init

    #Args:
        directory_name: directory where geoname.org files are. DB will be in 'cache' folder under this
        display_progress: None or function to display progress(percent_done:int, msg:str)
        show_message: If True - show TKInter message dialog on error
        exit_on_error: If True - exit on significant error
        languages_list_dct: Dictionary of ISO-2 languages to import from AlternateNamesV2.txt
        feature_code_list_dct: Dictionary of Geoname Feature codes to import into DB
        supported_countries_dct: Dictionary of ISO-2 Country codes to import into DB
        volume: passed through unchanged to GeodataBuild
    """
    self.logger = logging.getLogger(__name__)
    self.display_progress = display_progress
    self.miss_diag_file = None

    # Value to determine if two lat/longs are similar based on Rectilinear Distance
    self.distance_cutoff = 0.6
    self.save_place: Loc = Loc.Loc()

    build_options = dict(
        display_progress=self.display_progress,
        show_message=show_message,
        exit_on_error=exit_on_error,
        languages_list_dct=languages_list_dct,
        feature_code_list_dct=feature_code_list_dct,
        supported_countries_dct=supported_countries_dct,
        volume=volume,
    )
    self.geo_build = GeodataBuild.GeodataBuild(str(directory_name), **build_options)
def load_handler(self):
    """
    The User pressed the LOAD button to load and review an Ancestry file.

    Switches the app display to the Review layout, creates the ancestry file
    handler matching the configured file (GEDCOM or Gramps XML), opens the
    diagnostic files next to it and then starts processing place entries.
    """
    self.w.original_entry.text = ""
    self.w.remove_initialization_widgets()  # Remove old widgets
    self.w.create_review_widgets()  # Switch display from Initial widgets to review widgets

    self.load_data_files()

    ged_path = self.cfg.get("gedcom_path")  # Get saved config setting for file

    # Load appropriate ancestry file handler based on file type (Gramps XML or GEDCOM)
    if ged_path is not None:
        if '.ged' in ged_path:
            # GEDCOM
            self.out_suffix = "import.ged"
            # Routines to open and parse GEDCOM file
            self.ancestry_file_handler = Gedcom.Gedcom(in_path=ged_path, out_suffix=temp_suffix,
                                                       cache_d=self.cache_dir, progress=None)
        elif '.gramps' in ged_path:
            # GRAMPS
            self.out_suffix = "import.gramps"
            # Routines to open and parse Gramps file
            self.ancestry_file_handler = GrampsXml.GrampsXml(in_path=ged_path, out_suffix=temp_suffix,
                                                             cache_d=self.cache_dir, progress=None,
                                                             geodata=self.geodata)
        else:
            self.out_suffix = 'unk.new.ged'
            # showwarning takes (title, message). Passing only the message put the
            # text in the dialog title and left the dialog body empty.
            messagebox.showwarning('Unknown file type',
                                   f'UNKNOWN File type. Not .gramps and not .ged. \n\n{ged_path}')
            # NOTE(review): no handler is created on this path, so the .error check
            # below uses whatever self.ancestry_file_handler held before - confirm.

        # Diagnostic files are kept open on the instance for later writes
        self.out_diag_file = open(ged_path + '.output.txt', 'wt')
        self.in_diag_file = open(ged_path + '.input.txt', 'wt')
        miss_diag_fname = ged_path + '.miss.txt'
        self.geodata.open_diag_file(miss_diag_fname)

        if self.ancestry_file_handler.error:
            TKHelper.fatal_error(f"File {ged_path} not found.")

    self.w.root.update()

    self.place: Loc.Loc = Loc.Loc()  # Create an object to store info for the current Place

    # Add filename to Title
    # NOTE(review): os.path.split() raises TypeError when ged_path is None - confirm
    # that a missing "gedcom_path" setting cannot reach this point.
    path_parts = os.path.split(ged_path)  # Extract filename from full path
    self.w.title.text = f'GEO FINDER v{__version__.__version__} - {path_parts[1]}'

    # Read file, find each place entry and handle it.
    self.w.user_entry.text = "Scanning to previous position..."
    self.process_place_entries()
def find_best_match(self, location: str, place: Loc) -> bool:
    """
    Find the best scoring match for this location in the geoname dictionary.

    #Args:
        location: location name, e.g. Los Angeles, California, USA
        place: Loc instance
    #Returns:
        True if a match was found. place is updated with the data of the
        single best row (when there is one) and its prefix is cleaned up
        against the resulting long name
    """
    # Parse and look up the location, then rank the candidates
    res = self.find_matches(location, place)
    flags = self.filter_results(place)

    if place.georow_list:
        # Keep only the best entry and copy it into place
        place.georow_list = place.georow_list[:1]
        self.process_results(place=place, flags=flags)
        place.set_place_type()
    elif res not in GeoUtil.successful_match:
        return False

    # Both successful paths finish by cleaning the prefix against the long name
    nm = f'{place.get_long_name(self.geo_build.output_replace_dct)}'
    place.prefix = place.prefix_cleanup(place.prefix, nm)
    return True
def run_test_inscore(idx) -> int:
    """
    Run test case idx through the input (weighted token) score only.

    #Args:
        idx: index of the test case handed to TestScoring.prepare_test
    #Returns:
        The value returned by TestScoring.scoring._calculate_weighted_score
    """
    wanted = Loc.Loc()
    found = Loc.Loc()
    log = TestScoring.logger

    log.debug('TEST INPUT SCORE:')
    TestScoring.prepare_test(idx, wanted, found)
    log.debug(f'prepare_test: INP={wanted.city},{wanted.admin2_name},'
              f'{wanted.admin1_name},{wanted.country_name}'
              f' RES={found.city},{found.admin2_name},'
              f'{found.admin1_name},{found.country_name}')

    # Create full, normalized titles (prefix,city,county,state,country); only the tokens are used
    _, found_tokens, _, wanted_tokens = MatchScore._prepare_input(wanted, found)

    # Calculate score for percent of input target text that matched result
    sc = TestScoring.scoring._calculate_weighted_score(wanted_tokens, found_tokens)

    print(f'#{idx} SCORE={sc:.1f} In={sc:.1f}[{wanted.original_entry.title().lower()}] [{found.get_five_part_title()}]')
    return sc
def _lookup_city_as_admin2(self, place: Loc, result_list) -> int:
    """
    Lookup place.city as admin2 name

    The city text is moved into place.admin2_name, place.city is blanked and
    the place is typed as ADMIN2 before the lookup.

    #Args:
        place: Loc instance to search for (modified as described above)
        result_list: list that the rows found by this lookup are appended to
    #Returns:
        The value returned by geodb.lookup_place()
    """
    place.admin2_name, place.city = place.city, ''
    place.place_type = Loc.PlaceType.ADMIN2

    self.logger.debug(f' Try admin2 [{place.admin2_name}] as city [{place.get_five_part_title()}]')

    best = self.geo_build.geodb.lookup_place(place=place)
    result_list.extend(place.georow_list)
    return best
def __init__(self, geodb):
    """
    Set up search state for the given database.

    #Args:
        geodb: database object that searches are run against
    """
    self.logger = logging.getLogger(__name__)
    self.geodb = geodb

    # Behaviour switches
    self.detailed_debug = True
    self.use_wildcards = True

    # Bookkeeping
    self.start = 0
    self.total_lookups = 0
    self.cache = {}
    self.place_type = ''

    # Columns requested from the database
    self.select_str = 'name, country, admin1_id, admin2_id, lat, lon, feature, geoid, sdx'

    # Helper objects
    self.match = MatchScore.MatchScore()
    self.norm = Normalize.Normalize()
    self.place = Loc.Loc()
def is_country_valid(self, place: Loc) -> bool:
    """
    See if COUNTRY is present and is in the supported country list

    #Args:
        place: Loc instance. result_type is set to NO_COUNTRY when the ISO code
            is blank, or to NOT_SUPPORTED (with place_type COUNTRY) when the
            code is not in supported_countries_dct
    #Returns:
        True if country is valid
    """
    iso = place.country_iso
    if iso == '':
        place.result_type = GeoUtil.Result.NO_COUNTRY
        return False

    if iso in self.geo_build.supported_countries_dct:
        return True

    self.logger.debug(f'Country [{place.country_iso}] not supported')
    place.result_type = GeoUtil.Result.NOT_SUPPORTED
    place.place_type = Loc.PlaceType.COUNTRY
    return False
def lookup_place(self, place: Loc) -> []:
    """
    **Lookup a place in geoname.org db**

    place.result_type is first set to STRONG_MATCH. Then, based on
    place.place_type:
    Loc.PlaceType.COUNTRY: no search is run. If place.georow_list already has
        rows, country_name is refreshed through get_country_name() and the
        score is VERY_GOOD; otherwise the score is VERY_POOR.
    Otherwise: place.georow_list is cleared and self._search() is run on
        place.city (ADMIN1 places get feature "ADM1" first).

    # Args:
        place: Loc instance. Call Loc.parse_place() before calling lookup_place()
    # Returns:
        Best score found. place.georow_list holds the matching rows
    """
    place.result_type = Result.STRONG_MATCH

    if place.place_type != Loc.PlaceType.COUNTRY:
        # General search
        if place.place_type == Loc.PlaceType.ADMIN1:
            place.feature = "ADM1"
        place.georow_list.clear()
        best_score = self._search(place=place, georow_list=place.georow_list, name=place.city,
                                  admin1_id=place.admin1_id, admin2_id=place.admin2_id,
                                  iso=place.country_iso, feature=place.feature,
                                  sdx=get_soundex(place.city))
    elif place.georow_list:
        # Country with rows already present
        place.country_name = self.get_country_name(place.country_name)
        best_score = MatchScore.Score.VERY_GOOD
    else:
        best_score = MatchScore.Score.VERY_POOR

    self.logger.debug(f'**LOOKUP PLACE score={best_score}')
    return best_score
def add_alias_to_db(self, ky: str, geo_build: GeodataBuild):
    """
    Insert the alias ky into the database as a copy of its main entry.

    The alias row for ky is read from alias_list, the place it points to is
    looked up, and the first row found is re-inserted under the name ky.
    Nothing is inserted when the lookup finds no (non-empty) row.

    #Args:
        ky: alias name; key into alias_list
        geo_build: GeodataBuild used for the lookup and the insert
    """
    alias_row = alias_list.get(ky)
    feature_code = alias_row[ALIAS_FEAT]

    place = Loc.Loc()
    place.country_iso = alias_row[ALIAS_ISO].lower()
    place.city = alias_row[ALIAS_NAME]
    place.feature = feature_code
    place.place_type = Loc.PlaceType.CITY

    # Lookup main entry and get GEOID
    geo_build.geodb.s.lookup_place(place)

    if not place.georow_list or len(place.georow_list[0]) == 0:
        return

    # Copy the columns up to and including SDX, rename the copy and insert it
    geo_row = list(place.georow_list[0][0:GeoUtil.Entry.SDX + 1])
    geo_build.update_geo_row_name(geo_row=geo_row, name=ky)
    geo_build.insert(geo_tuple=tuple(geo_row), feat_code=alias_row[ALIAS_FEAT])
def __init__(self, directory: str, filename: str, progress_bar, prefix, geo_build: GeodataBuild, lang_list):
    """
    Read in geonames alternate names file and add to geodata database in alt_names table

    # Args:
        directory: base directory for alternate names file
        filename: filename of geonames alternate_namesV2.txt file
        progress_bar: tkhelper progress bar or None
        prefix: passed on to the base class initializer together with
            directory, filename and progress_bar
        geo_build: GeodataBuild instance
        lang_list: List of ISO languages we want to support, e.g. ['fr', 'es']
    """
    super().__init__(directory, filename, progress_bar, prefix=prefix)

    self.geo_build: GeodataBuild.GeodataBuild = geo_build
    self.lang_list = lang_list
    self.sub_dir = GeoUtil.get_cache_directory(directory)

    self.search = None
    self.place = Loc.Loc()
def create_enclosures(self):
    """
    Walk through all entries and create any missing enclosure items

    There are separate dictionaries for each tier (prefix, city, county,
    country). They are visited in reverse order of
    self.hierarchy_dictionaries, and every key of a tier is loaded into a
    shared Loc and passed to _create_enclosed_by().
    """
    self.logger.debug('\n\n******** DONE \n CREATE CSV ENCLOSURES *********')
    place = Loc.Loc()

    # Snapshot (index, dictionary) pairs, then consume them from the end
    tiers = list(enumerate(self.hierarchy_dictionaries))
    while tiers:
        idx, dictionary = tiers.pop()
        self.logger.debug(f'===TABLE {idx}===')
        for key in dictionary:
            _retrieve_csv_place(self.hierarchy_dictionaries, self.geodata, place, key, idx)
            self.logger.debug(f'** CSV {key} {place.original_entry}')

            # Create enclosure for each node at this level
            self._create_enclosed_by(place)
def load_handler(self):
    """
    Load in global replace list and display

    Each entry of self.dict (key is the original name, value is
    @GEOID@PREFIX) is resolved through its GEOID and appended to the tree as
    "key -> long name", with the prefix shown in brackets when there is one.
    Entries with a blank GEOID are logged and skipped.
    """
    self.clear_display_list(self.tree)
    place = Loc.Loc()
    self.edit_entry.text = "Loading Replacement Dictionary"

    for key in sorted(self.dict):
        prefix, geoid = ReplacementDictionary.parse_replacement_entry(self.dict[key])
        if geoid == '':
            self.logger.warning(f'blank item=[{key}] ')
            continue

        # Lookup GEOID to get location info
        place.target = geoid
        self.geodb.get_geoid(place=place)

        if len(place.georow_list) > 0:
            # Found it. Copy geo row to Place
            self.geodb.copy_georow_to_place(row=place.georow_list[0], place=place)
        else:
            # Decide on the placeholder text before clear() is called
            target_is_blank = len(place.target) == 0
            place.clear()
            if target_is_blank:
                place.city1 = '<DELETE>'
            else:
                place.city1 = f'Database error for GeoID {geoid}'
            place.place_type = Loc.PlaceType.CITY

        # Get prefix if there was one
        place.prefix = prefix
        place.set_place_type()

        nm = place.get_long_name(self.output_replace_dct)
        has_prefix = len(place.prefix) > 0
        line = f'[{place.prefix}]{place.prefix_commas}{nm}' if has_prefix else f'{nm}'

        self.list_append(self.tree, key, line)

    self.edit_entry.text = ""
def lookup_place(self, location_name):
    """
    Look up location_name and print the best match, or the reason for no match.

    #Args:
        location_name: text of the location to search for
    #Returns:
        None. The outcome is printed
    """
    # Location instance that holds search parameters and result
    place: Loc.Loc = Loc.Loc()

    if self.geodata.find_best_match(location=location_name, place=place):
        # Create full name for result
        nm = f'{place.get_display_name(None)}'
        report = (f' Best match for {location_name}:\n {nm} '
                  f'Prefix=[{place.prefix}{place.prefix_commas}] Score= {place.score:.1f}\n')
    elif place.result_type == Geodata.GeoUtil.Result.NOT_SUPPORTED:
        report = f' NO match for {location_name}:\n Country NOT SUPPORTED: {place.country_name} \n'
    else:
        report = f' NO match for {location_name}:\n'

    print(report)
def find_matches(self, location: str, place: Loc):
    """
    Find a location in the geoname database.

    On successful match, place.georow_list will contain a list of georows that
    matched the name. Each georow can be copied to a Loc structure by calling
    process_results.

    Searches performed (results of all of them are pooled):
        1) Standard lookup based on parsing, followed by a deep lookup when the
           best score is at or above MatchScore.Score.POOR_CUTOFF
        2) If there is an Admin2 token, the same again with Admin2 used as city
    Country and Admin1 places are not searched here; their result_type is
    returned as it stands after parsing.

    #Args:
        location: comma separated name of location to find, e.g. 'Los Angeles, California, USA'
        place: Loc structure
    #Returns:
        GeoUtil.Result code (the final value of place.result_type)
    """
    place.parse_place(place_name=location, geo_db=self.geo_build.geodb)
    best_score = 9999

    self.is_country_valid(place)
    if place.result_type == GeoUtil.Result.NOT_SUPPORTED:
        place.georow_list.clear()
        # Return the Result code like every other exit does (this used to return
        # the integer placeholder score 9999)
        return place.result_type

    # Create full entry text
    place.update_names(self.geo_build.output_replace_dct)

    flags = ResultFlags(limited=False, filtered=False)
    result_list = []  # We will do different search types and append all results into result_list

    # Save a shallow copy of place so we can restore fields
    self.save_place = copy.copy(place)

    # After parsing, last token is either country or underscore.
    # Second to last is either Admin1 or underscore
    # If >2 tokens:  token[0] is placed in City and in Prefix
    # If >3 tokens:  token[1] is placed in Admin2 and appended to Prefix

    # 1) Try lookup based on standard parsing: lookup city, county, state/province, or country as parsed
    self.logger.debug(f' 1) Standard, based on parsing. pref [{place.prefix}] city [{place.city}]'
                      f' sdx={GeoSearch.get_soundex(place.city)} '
                      f'feat={place.feature} typ=[{place.place_type}]')

    # NOTE(review): the original test compared against ADMIN1 twice. The duplicate
    # is dropped (no behaviour change); ADMIN2 may have been intended - confirm.
    if place.place_type != Loc.PlaceType.COUNTRY and place.place_type != Loc.PlaceType.ADMIN1:
        self.logger.debug('find std place - not ADM* ')
        best_score = self.geo_build.geodb.s.lookup_place(place=place)
        self.logger.debug(f'std: best={best_score}')

        if place.georow_list:
            result_list.extend(place.georow_list)

        if best_score >= MatchScore.Score.POOR_CUTOFF:
            # No good matches found. Try a deep search on soundex of combinations of terms
            self.logger.debug('--- DEEP SEARCH city ---')
            best_score = self.geo_build.geodb.s.deep_lookup(place=place)
            if place.georow_list:
                result_list.extend(place.georow_list)

        # Restore fields
        self._restore_fields(place, self.save_place)

        # 2) Try second token (Admin2) as a city
        if place.admin2_name != '':
            self.logger.debug(f'try 2nd token as city')
            place.georow_list.clear()
            best_score = self._find_type_as_city(place, Loc.PlaceType.ADMIN2)
            self.logger.debug(f'2nd token best={best_score}')
            if place.georow_list:
                result_list.extend(place.georow_list)

            # See if we found any good scoring matches
            if best_score >= MatchScore.Score.POOR_CUTOFF:
                # No good matches found. Try a deep search on soundex of combinations of terms
                self.logger.debug('--- DEEP SEARCH city ---')
                best_score = self.geo_build.geodb.s.deep_lookup(place=place)
                if place.georow_list:
                    result_list.extend(place.georow_list)

            self._restore_fields(place, self.save_place)

        # Move result_list into place georow list
        place.georow_list.clear()
        place.georow_list.extend(result_list)
    else:
        # Reached for COUNTRY and ADMIN1 places (the log text reads the other way round)
        self.logger.debug('not country, adm1, adm2')
        return place.result_type

    if len(place.georow_list) > 0:
        # Score every candidate, then rank and trim the list
        self.geo_build.geodb._assign_scores(place.georow_list, place, '', fast=False, quiet=True)
        self.process_results(place=place, flags=flags)
        flags = self.filter_results(place)

    if len(place.georow_list) == 0:
        # NO MATCH
        if place.result_type != GeoUtil.Result.NO_COUNTRY and place.result_type != GeoUtil.Result.NOT_SUPPORTED:
            place.result_type = GeoUtil.Result.NO_MATCH
            self.logger.debug(f'Not found.')
        else:
            self.logger.debug('Found country')
    elif len(place.georow_list) > 1:
        self.logger.debug(f'Success! {len(place.georow_list)} matches')
        place.result_type = GeoUtil.Result.MULTIPLE_MATCHES

    # Process the results
    self.process_results(place=place, flags=flags)
    return place.result_type
def filter_results(self, place: Loc):
    """
    Sort place.georow_list by match score and keep only rows near the best score.

    Duplicates are removed first through self.remove_duplicates(place). The
    remaining rows are sorted by (SCORE, ADM1) and a row is kept only if its
    score is no more than gap_threshold above the best (lowest) score, where
    gap_threshold = VERY_GOOD + GOOD / 3 + 0.6 * abs(best score).

    If exactly one row survives, the best score is at most VERY_GOOD and the
    result is not NOT_SUPPORTED, place.result_type becomes STRONG_MATCH.
    Otherwise the lookup is written to the miss diagnostic file, if one is open.

    #Args:
        place: Loc instance whose georow_list is filtered in place
    #Returns:
        ResultFlags(limited, filtered). limited is True when the incoming list
        had more than 100 rows; filtered is always False here (no date
        filtering is performed)
    """
    flags = ResultFlags(limited=len(place.georow_list) > 100, filtered=False)

    if len(place.georow_list) == 0:
        self.logger.debug('EMPTY')
        return flags

    # Remove duplicate locations in list (have same name and lat/lon)
    self.remove_duplicates(place)
    if len(place.georow_list) == 0:
        self.logger.error(f'georow_list = 0')
        return flags

    # Sort places in match_score order; sorted() of a non-empty list is never empty
    ranked = sorted(place.georow_list, key=itemgetter(GeoUtil.Entry.SCORE, GeoUtil.Entry.ADM1))
    top_row = ranked[0]
    if len(top_row) < GeoUtil.Entry.SCORE + 1:
        self.logger.debug(f'len = {len(top_row)}')
        self.logger.debug(f'[{top_row}]')
        return flags

    min_score = top_row[GeoUtil.Entry.SCORE]

    # The threshold depends only on the best score, so compute it once
    gap_threshold = MatchScore.Score.VERY_GOOD + (MatchScore.Score.GOOD / 3) + abs(min_score) * .6
    cutoff = min_score + gap_threshold

    # Go through sorted list and only add items to georow_list that are close to the best score
    place.georow_list.clear()
    score = 0
    for geo_row in ranked:
        score = geo_row[GeoUtil.Entry.SCORE]
        if score > cutoff:
            self.logger.debug(
                f'SKIP Score={score:.1f} Min={min_score:.1f} Gap={gap_threshold:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}]'
                f' {geo_row[GeoUtil.Entry.NAME]},'
                f' {geo_row[GeoUtil.Entry.ADM2]},'
                f' {geo_row[GeoUtil.Entry.ADM1]} ')
            continue

        place.georow_list.append(geo_row)
        self.logger.debug(
            f'Score {score:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}] {geo_row[GeoUtil.Entry.NAME]}, '
            f'AD2={geo_row[GeoUtil.Entry.ADM2]},'
            f' AD1={geo_row[GeoUtil.Entry.ADM1]} {geo_row[GeoUtil.Entry.ISO]}')

    single_strong_row = (min_score <= MatchScore.Score.VERY_GOOD
                         and len(place.georow_list) == 1
                         and place.result_type != GeoUtil.Result.NOT_SUPPORTED)
    if single_strong_row:
        place.result_type = GeoUtil.Result.STRONG_MATCH
    elif self.miss_diag_file:
        # Log item that we couldnt match (score is the last row's score)
        self.miss_diag_file.write(
            f'Lookup {place.original_entry} thresh={gap_threshold} gap={score - min_score}\n\n')

    return flags
def match_score(self, target_place: Loc, result_place: Loc, fast=False) -> float:
    """
    Calculate a heuristic score for how well a result place name matches a target place name.

    The score is based on percent of characters that didnt match in input and
    output (plus other items described below). Mismatch score is 0-100%
    reflecting the percent mismatch between the user input and the result.
    This is then adjusted by Feature type (large city gives best score) plus
    other items to give a final heuristic where -10 is perfect match of a
    large city and 100 is no match.

    A) Heuristic:
    1) Create 5 part title (prefix, city, county, state/province, country)
    2) Normalize text - self.norm.normalize_for_scoring()
    3) Remove sequences of 2 chars or more that match in target and result
    4) Calculate inscore - percent of characters in input that didn't match result.
       Weight by term (city,,county,state,ctry). Exact match of city term gets a bonus
    5) Calculate result score - percent of characters in db result that didn't match input

    B) Score components (All are weighted in final score):
    in_score - (0-100) - score for input that didnt match output
    feature_score - (0-100) More important features get lower score.
        City with 1M population is zero. Valley is 100. Geodata.feature_priority().
    wildcard_penalty - score is raised by X if it includes a wildcard
    prefix_penalty - score is raised by length of Prefix

    C) A standard text difference, such as Levenstein, was not used because
    those treat both strings as equal, whereas this treats the User text as
    more important than DB result text and also weights each token. A user's
    text might commonly be something like: Paris, France and a DB result of
    Paris, Paris, Ile De France, France. The Levenstein distance would be
    large, but with this heuristic, the middle terms can have lower weights,
    and having all the input matched can be weighted higher than mismatches
    on the county and province. This heuristic gives a score of -9 for
    Paris, France.

    # Args:
        target_place: Loc with users entry. Its prefix is modified while
            scoring and restored before returning (also when scoring raises)
        result_place: Loc with DB result.
        fast: if True, return self.fast_score() instead of the full heuristic

    # Returns:
        score
    """
    if fast:
        return self.fast_score(target_place, result_place)

    self.score_diags = ''  # Diagnostic text for scoring
    self.timing = 0
    save_prefix = target_place.prefix

    try:
        # Remove items in prefix that are in result
        if target_place.place_type != Loc.PlaceType.ADVANCED_SEARCH:
            target_place.prefix = self.norm.normalize_for_scoring(target_place.prefix)
            result_name = result_place.get_long_name(None)
            target_place.prefix = Loc.Loc.fast_prefix(target_place.prefix, result_name)
        else:
            target_place.updated_entry = target_place.get_long_name(None)

        # Create full, normalized titles (prefix,city,county,state,country)
        result_title, result_tokens, target_title, target_tokens = self._prepare_input(target_place, result_place)

        # Calculate Prefix score. Prefix is not used in search and longer is generally worse
        prefix_score = _calculate_prefix_penalty(target_place.prefix)

        # Calculate score for percent of input target text that matched result
        in_score = self._calculate_weighted_score(target_tokens, result_tokens)

        # Calculate score for wildcard search - wildcard searches are missing letters and need special handling
        wildcard_score = self._calculate_wildcard_score(target_place.original_entry)

        # Calculate Feature score - this ensures "important" places get higher rank (large city, etc)
        feature_score = Geodata.Geodata._feature_priority(result_place.feature)

        # Weight and add up scores - Each item is 0-100 and then weighted, except wildcard penalty
        score: float = in_score * self.input_weight + feature_score * self.feature_weight + \
                       prefix_score * self.prefix_weight + wildcard_score

        self.logger.debug(
            f'SCORE {score:.1f} res=[{result_title}] pref=[{target_place.prefix}]'
            f'inSc={in_score * self.input_weight:.1f}% feat={feature_score * self.feature_weight:.1f} {result_place.feature} '
            f'wild={wildcard_score} pref={prefix_score * self.prefix_weight:.1f}')
        self.logger.debug(self.score_diags)
    finally:
        # The caller's place must get its prefix back even if a scoring step raised
        target_place.prefix = save_prefix

    # A fixed offset of 8 is added to every full score (rationale not visible here)
    return score + 8
def __init__(self, directory: str, display_progress, show_message: bool, exit_on_error: bool,
             languages_list_dct: {}, feature_code_list_dct: {}, supported_countries_dct: {}, volume=''):
    """
    Read in datafiles needed for geodata, filter them and create a sql db.

    Filter dictionary examples:
        languages_list_dct={'fr','de'}
        feature_code_list_dct={'PPL', 'ADM1', 'CSTL'}
        supported_countries_dct = {'us','gb','at'}

    # Args:
        directory: base directory
        display_progress: None or Handler called with percent_done:int, msg:str
        show_message: True to show message boxes to user on errors
        exit_on_error: True to exit on serious errors
        languages_list_dct: dictionary containing the ISO-2 languages to load from alternateNames
        feature_code_list_dct: dictionary containing the Geonames.org feature codes to load
        supported_countries_dct: dictionary containing the ISO-2 countries to load
        volume: disk volume to use - e.g. C: for Windows or /Volumes/xyz for OSX, /media/xyz for linux.
            When not empty, the process changes its working directory to it
    """
    self.logger = logging.getLogger(__name__)

    # Caller supplied settings
    self.directory: str = directory
    self.progress_bar = display_progress
    self.show_message = show_message
    self.exit_on_error = exit_on_error
    self.languages_list_dct = languages_list_dct
    self.feature_code_list_dct = feature_code_list_dct
    self.supported_countries_dct = supported_countries_dct
    # TODO fix volume handling
    self.volume = volume

    # Database state
    self.geodb: [GeoDB.GeoDB, None] = None
    self.geoid_main_dict = {}  # Key is GEOID, Value is DB ID for entry
    self.geoid_admin_dict = {}  # Key is GEOID, Value is DB ID for entry
    self.collate = 'COLLATE NOCASE'
    self.required_db_version = 4
    # Message to user upgrading from earlier DB version
    self.db_upgrade_text = 'Renamed column to Feature'

    self.line_num = 0
    self.cache_changed: bool = False
    self.country = None

    sub_dir = GeoUtil.get_cache_directory(self.directory)
    self.norm = Normalize.Normalize()
    self.lang_list = list(self.languages_list_dct)

    if volume != '':
        os.chdir(volume)

    if not os.path.exists(sub_dir):
        self.logger.warning(f'Directory] {sub_dir} NOT FOUND')
        if self.show_message:
            messagebox.showwarning('Folder not found', f'Directory\n\n {sub_dir}\n\n NOT FOUND')
        if exit_on_error:
            sys.exit()

    # Read in Text Replacement dictionary pickle - this has output text replacements
    self.output_replace_cd = CachedDictionary.CachedDictionary(sub_dir, "output_list.pkl")
    self.output_replace_cd.read()
    self.output_replace_dct: Dict[str, str] = self.output_replace_cd.dict
    self.output_replace_list = list(self.output_replace_dct)

    self.entry_place = Loc.Loc()

    # Support for Geonames AlternateNames file. Adds alternate names for entries
    self.alternate_names = AlternateNames.AlternateNames(directory=self.directory, geo_build=self,
                                                         progress_bar=self.progress_bar,
                                                         prefix="Step 3 of 4) ",
                                                         filename='alternateNamesV2.txt',
                                                         lang_list=self.lang_list)
def copy_georow_to_place(self, row, place: Loc, fast: bool = False):
    """
    Copy data from DB row into place instance

    Country, admin1_id, admin2_id, city, lat/lon, feature, geoid and prefix are
    updated if available. The place type is derived from the feature code:
    ADM0 -> COUNTRY, ADM1 -> ADMIN1, ADM2 -> ADMIN2, anything else -> CITY
    (only then is the row name copied into place.city). Names are then filled
    in by self.s.update_names(place).

    #Args:
        row: georow from geoname database
        place: Loc instance
        fast: Currently ignored (defaults to False so callers may omit it)
    #Returns:
        None. Place instance is updated with data from georow
    """
    # Start from blank admin/city fields; only the ones the feature type has are filled in
    place.admin1_id = ''
    place.admin2_id = ''
    place.admin1_name = ''
    place.admin2_name = ''
    place.city = ''

    place.country_iso = str(row[Entry.ISO])
    place.lat = row[Entry.LAT]
    place.lon = row[Entry.LON]
    place.feature = str(row[Entry.FEAT])
    place.geoid = str(row[Entry.ID])
    place.prefix = row[Entry.PREFIX]
    place.place_type = Loc.PlaceType.CITY

    if place.feature == 'ADM0':
        place.place_type = Loc.PlaceType.COUNTRY
    elif place.feature == 'ADM1':
        place.admin1_id = row[Entry.ADM1]
        place.place_type = Loc.PlaceType.ADMIN1
    elif place.feature == 'ADM2':
        place.admin1_id = row[Entry.ADM1]
        place.admin2_id = row[Entry.ADM2]
        place.place_type = Loc.PlaceType.ADMIN2
    else:
        place.admin1_id = row[Entry.ADM1]
        place.admin2_id = row[Entry.ADM2]
        place.city = row[Entry.NAME]

    self.s.update_names(place)

    if place.admin2_name is None:
        place.admin2_name = ''
    if place.admin1_name is None:
        place.admin1_name = ''

    # Test for None BEFORE converting: str(None) is the text 'None', which made
    # the original None check unreachable and leaked 'None' as a city name
    place.city = '' if place.city is None else str(place.city)

    # Rows that have not been scored yet have no SCORE column
    try:
        place.score = row[Entry.SCORE]
    except IndexError:
        pass
def setUp(self) -> None:
    """Store a newly created Loc instance in self.place."""
    fresh_place: Loc.Loc = Loc.Loc()
    self.place = fresh_place
def _assign_scores(self, georow_list, place, target_feature, fast=False, quiet=False) -> float:
    """
    Assign match score to each result in list

    Every row in georow_list is replaced by a tuple that carries its match
    score in the SCORE column and the normalized target prefix in the PREFIX
    column. If the best score is below MatchScore.Score.STRONG_CUTOFF,
    place.result_type is set to STRONG_MATCH.

    Args:
        georow_list: list of georow tuples; updated in place
        place: target Loc to score against. If None, rows are only padded with
            a dummy score of 1 and no scoring is done
        target_feature: feature code; rows with this feature get a 10 point bonus
        fast: passed through to match_score() and copy_georow_to_place()
        quiet: currently unused (the log level switch is commented out)
    Returns:
        Best (lowest) score found; 999.9 when place is None or nothing was scored
    """

    def with_score_column(row) -> list:
        # Mutable copy of the row, extended by one element if the SCORE column is missing
        columns = list(row)
        if len(columns) < Entry.SCORE + 1:
            columns.append(1)
        return columns

    result_place: Loc = Loc.Loc()
    start = time.time()
    best_score = 999.9

    if place is None:
        # Add dummy match quality score to each entry
        for idx, rw in enumerate(georow_list):
            georow_list[idx] = tuple(with_score_column(rw))
        return best_score

    original_prefix = place.prefix

    # Remember the root log level (the code that lowered it when quiet is disabled)
    lev = logging.getLogger().getEffectiveLevel()

    # Add match quality score and prefix to each entry
    for idx, rw in enumerate(georow_list):
        place.prefix = original_prefix
        if len(rw) == 0:
            continue

        self.copy_georow_to_place(row=rw, place=result_place, fast=fast)
        result_place.original_entry = result_place.get_long_name(None)

        bonus = 10.0 if result_place.feature == target_feature else 0

        if len(place.prefix) > 0 and result_place.prefix == '':
            result_place.prefix = ' '
        else:
            result_place.prefix = ''

        score = self.match.match_score(target_place=place, result_place=result_place, fast=fast) - bonus
        best_score = min(best_score, score)

        # Convert row tuple to list and extend so we can assign score
        update = with_score_column(rw)
        update[Entry.SCORE] = score

        result_place.prefix = self.norm.normalize(place.prefix, True)
        update[Entry.PREFIX] = result_place.prefix
        georow_list[idx] = tuple(update)  # Convert back from list to tuple

    if best_score < MatchScore.Score.STRONG_CUTOFF:
        place.result_type = Result.STRONG_MATCH

    # Restore logging level
    logging.getLogger().setLevel(lev)
    elapsed = time.time() - start
    self.logger.debug(f'assign_scores min={best_score} elapsed={elapsed:.3f}')
    return best_score
def setUp(self) -> None:
    """Store newly created Loc instances on the TestScoring class as in_place and out_place."""
    fresh_in: Loc.Loc = Loc.Loc()
    fresh_out: Loc.Loc = Loc.Loc()
    TestScoring.in_place = fresh_in
    TestScoring.out_place = fresh_out