def get_soundex_by_word(text: str) -> str:
    """
    Build a string of soundex codes for the space separated words of text.

    Each word is passed through Loc.sort_words() and then GeoSearch.get_soundex().

    #Args:
        text: words separated by single spaces
    #Returns:
        The codes in word order, separated by single spaces.  While the
        accumulated string is still empty a code replaces it rather than being
        appended, so empty codes at the start leave no leading separator.
    """
    joined_codes = ''
    for word in text.split(' '):
        code = GeoSearch.get_soundex(Loc.sort_words(word))
        joined_codes = code if len(joined_codes) == 0 else joined_codes + ' ' + code
    return joined_codes
def prefix_cleanup(pref: str, result: str) -> str:
    """
    Strip from a prefix the words that also occur in a match result.

    Both strings are lower-cased and split on commas.  Each word of each prefix
    segment is checked against each result segment, by text and by soundex code,
    and matching words are removed from the prefix with remove_item().  Commas
    are dropped from the returned prefix.

    #Args:
        pref: prefix text, segments separated by commas
        result: match result text, segments separated by commas
    #Returns:
        Prefix with words removed
    """
    cleaned = pref.lower().strip(' ')
    # Segments are taken once from the initial prefix; later removals only affect `cleaned`
    pref_segments = cleaned.split(',')

    for res_segment in result.lower().split(','):
        res_codes = ' ' + Loc.get_soundex_by_word(res_segment) + ' '
        # Surround with blanks so that blank-padded short words can be found with `in`
        remaining = ' ' + res_segment + ' '

        for pref_segment in pref_segments:
            for word in pref_segment.split(' '):
                word_code = ' ' + GeoSearch.get_soundex(word) + ' '
                if len(word) < 3:
                    # Short (including empty) words get a trailing blank
                    word += ' '

                # Remove words in prefix that are in the result segment.
                # Both operands are blank padded here, so neither can be empty.
                if word in remaining or word_code in res_codes:
                    bare_word = word.strip(' ')
                    remaining = remove_item(bare_word, remaining)
                    cleaned = remove_item(bare_word, cleaned)

                # See if any words in the result segment are in the prefix word.
                # The word list is taken before the loop body changes `remaining`.
                for res_word in remaining.split(' '):
                    if len(res_word) < 3:
                        res_word += ' '
                    # Result word must cover more than 60% of the prefix word
                    if res_word in word and len(res_word) / len(word) > 0.6:
                        # Remove result word from result and prefix
                        remaining = remove_item(res_word, remaining)
                        cleaned = remove_item(res_word, cleaned)

    return re.sub('[,]', '', cleaned).strip(' ').strip(',')
def insert_alternate_name(self, alternate_name: str, geoid: str, lang: str):
    """
    Store one alternate name for a geoid in the altname table.

    The soundex column is computed from the normalized form of the name; the
    name column holds the name exactly as passed in.  The row is written with
    INSERT OR IGNORE through self.geodb.db.execute().

    #Args:
        alternate_name: alternate name to add for this geoid
        geoid: geonames.org geoid
        lang: ISO lang code for this entry
    #Returns: None
    """
    normalized_name = self.norm.normalize(alternate_name, True)
    soundex = GeoSearch.get_soundex(normalized_name)
    self.geodb.db.execute(
        ''' INSERT OR IGNORE INTO altname(name,lang, geoid, sdx) VALUES(?,?,?,?) ''',
        (alternate_name, lang, geoid, soundex))
def update_geo_row_name(self, geo_row: [], name: str, normalize=True):
    """
    Overwrite the name and soundex entries of a geo row in place.

    #Args:
        geo_row: mutable row, indexed by GeoSearch.Entry.NAME and GeoSearch.Entry.SDX
        name: location name
        normalize: if True the name is stored as returned by
            self.norm.normalize(name, remove_commas=True), otherwise it is
            only lower-cased
    #Returns: None
    """
    if normalize:
        stored_name = self.norm.normalize(name, remove_commas=True)
    else:
        stored_name = name.lower()

    geo_row[GeoSearch.Entry.NAME] = stored_name
    # Soundex is always derived from the name that was just stored
    geo_row[GeoSearch.Entry.SDX] = GeoSearch.get_soundex(stored_name)
def __init__(self, db_path, show_message: bool, exit_on_error: bool, set_speed_pragmas: bool, db_limit: int):
    """
    Set up the geoname database wrapper and open the database file.

    If the file was already present before it is opened, a sanity query is run
    against main.geodata.  When that query reports an error the user may be
    asked (show_message) whether to delete the file, after which the process
    exits (exit_on_error) or ValueError is raised.

    # Args:
        db_path: full path to database file
        show_message: If True, show messagebox to user on error
        exit_on_error: If True, exit if significant error occurs
        set_speed_pragmas: If True, call self.db.set_speed_pragmas()
        db_limit: SQL LIMIT parameter
    # Raises:
        ValueError('Cannot open database'), ValueError('Database empty or corrupt')
    """
    self.logger = logging.getLogger(__name__)
    self.show_message = show_message
    self.exit_on_error = exit_on_error
    self.max_query_results = 50
    # Lookup statistics
    self.total_time = self.total_lookups = self.slow_lookup = 0
    self.match = MatchScore.MatchScore()
    self.norm = Normalize.Normalize()
    self.db_path = db_path

    # Record whether the file was present before DB.DB() is constructed
    present_before_open = os.path.exists(db_path)

    self.db = DB.DB(db_filename=db_path, show_message=show_message, exit_on_error=exit_on_error)
    if self.db.err != '':
        self.logger.error(f"Error! cannot open database {db_path}.")
        raise ValueError('Cannot open database')

    # The sanity test is only run on a database that was initially found
    if present_before_open:
        sanity_error = self.db.test_database(
            'name', 'main.geodata', where='name = ? AND country = ?', args=('ba', 'fr'))

        if sanity_error:
            # DB failed sanity test
            self.logger.warning(f'DB error for {db_path}')

            if show_message and messagebox.askyesno(
                    'Error',
                    f'Geoname database is empty or corrupt:\n\n {db_path} \n\nDo you want to delete it and rebuild?'):
                messagebox.showinfo('', 'Deleting Geoname database')
                self.db.conn.close()
                os.remove(db_path)

            # A failed sanity test always ends construction, one way or the other
            if exit_on_error:
                sys.exit()
            else:
                raise ValueError('Database empty or corrupt')

    if set_speed_pragmas:
        self.db.set_speed_pragmas()

    self.db_limit = db_limit
    self.db.order_string = ''
    self.db.limit_string = f'LIMIT {self.db_limit}'
    self.place_type = ''
    self.s: GeoSearch.GeoSearch = GeoSearch.GeoSearch(geodb=self)
def _weighted_score(self, target_tokens: [], result_tokens: []) -> float:
    """
    Compute a weighted average difference between target and result tokens.

    Tokens are compared position by position.  Index 0, and any index that has
    no counterpart in result_tokens, is skipped.  For every other position a
    difference value is computed (smaller is a better match), multiplied by
    self.token_weight[idx] and accumulated; the total is divided by the sum of
    the weights that were used.

    Side effects: the time spent in the text fuzz.ratio() call is added to
    self.timing, and one diagnostic entry per scored token is appended to
    self.score_diags.

    #Args:
        target_tokens: segments of the target (per the inline comment: city,
            county, state/province, country)
        result_tokens: segments of the result, compared index by index
    #Returns:
        Weighted average of the per-token values, or 0 if no token was scored.
        NOTE(review): the match bonuses are subtracted after the 60/40 blend,
        so a near-exact match can produce a slightly negative value.
    """
    num_inp_tokens = 0.0
    score = 0.0
    # bonus is never modified below; it is added to the result unchanged
    bonus = 0.0
    token_weight = copy.copy(self.token_weight)

    # Disabled: set weighting to half if target token for that term is blank
    # for idx, item in enumerate(target_tokens):
    #     if len(item) == 0:
    #         token_weight[idx] *= 0.5

    # Calculate difference of each target segment to result segment (city, county, state/province, country)
    # Each segment can have a different weighting.  e.g. county can have lower weighting
    for idx, segment in enumerate(target_tokens):
        # Index 0 is never scored; indexes beyond the end of result_tokens are ignored
        if idx < len(result_tokens) and idx > 0:
            if len(target_tokens[idx]) > 0:
                # Calculate fuzzy Levenshtein distance between words, smaller is better
                # (assumes fuzz.ratio returns a 0-100 similarity - TODO confirm)
                start = time.time()
                fz = 100.0 - fuzz.ratio(target_tokens[idx], result_tokens[idx])
                self.timing += (time.time() - start)
                # Calculate fuzzy Levenshtein distance between Soundex of words
                sd = 100.0 - fuzz.ratio(
                    GeoSearch.get_soundex(target_tokens[idx]),
                    GeoSearch.get_soundex(result_tokens[idx]))
                # Blend: 60% text difference, 40% soundex difference
                value = fz * 0.6 + sd * 0.4
                # Extra bonus for good match
                if value < 10:
                    value -= 6
                if target_tokens[idx][0:5] == result_tokens[idx][0:5]:
                    # Bonus if the first five characters match
                    value -= 3
            elif len(result_tokens[idx]) > 0:
                # Target had no entry for this term but the result does
                # Give a penalty if target didn't have country
                if idx == COUNTRY_IDX:
                    value = 15.0
                elif idx == ADMIN1_IDX:
                    # Give a penalty if target didn't have state/province
                    value = 5.0
                else:
                    value = 5.0
            else:
                # Neither target nor result has an entry for this term
                if idx == COUNTRY_IDX:
                    value = 15.0
                elif idx == ADMIN1_IDX:
                    # Give a penalty if target didn't have state/province
                    value = 5.0
                else:
                    value = 0
            score += value * token_weight[idx]
            num_inp_tokens += token_weight[idx]
            self.score_diags += f' {idx}) {value:.1f} [{result_tokens[idx]}]'
        else:
            # Nothing to compare at this index
            pass

    # Average over number of tokens (with fractional weight), so the result does not
    # grow with the weighting or the number of tokens
    if num_inp_tokens > 0:
        score = (score / num_inp_tokens)
    else:
        score = 0
    return score + bonus
def handle_line(self, line_num, row):
    """
    Handle one tab separated line of the alternate names file.

    The line is split into five tokens (everything after the fourth tab stays
    in the last token); lines with fewer tokens are logged and skipped.  A
    blank language code is treated as 'en'.  If the language is in
    self.lang_list and the geoid is found in the main or admin geoid
    dictionary, the existing row is looked up.  When the alternate name differs
    from that row's name, a copy of the row carrying the new name and soundex
    is inserted into the main DB (unless its feature contains ADM1 or ADM2) and
    the name is added to the altnames table.  self.count is incremented once
    per insert.

    :param line_num: file line number, only used in the debug message
    :param row: line in file to be handled
    :return: None
    """
    # Identity test: `== None` would defer to a custom __eq__ on the search object
    if self.search is None:
        # Create search instance on first use
        self.search = GeoSearch.GeoSearch(self.geo_build.geodb)

    alt_tokens = row.split('\t', maxsplit=4)
    if len(alt_tokens) != 5:
        self.logger.debug(f'Incorrect number of tokens {len(alt_tokens)}: {alt_tokens} line {line_num}')
        return

    self.place.georow_list = []

    # A blank language code is treated as English
    if alt_tokens[ALT_LANG] == '':
        alt_tokens[ALT_LANG] = 'en'

    alt_name = alt_tokens[ALT_NAME]
    alt_geoid = alt_tokens[ALT_GEOID]
    alt_lang = alt_tokens[ALT_LANG]

    # Only add if lang is in requested lang list
    if alt_lang not in self.lang_list:
        return

    # Only add this alias if DB has an entry for it (since geoname DB is filtered)
    # Check Main DB first - see if item has an entry with same GEOID
    dbid = self.geo_build.geoid_main_dict.get(alt_geoid)
    if dbid is not None:
        self.search.lookup_dbid(self.place.georow_list, dbid, place=self.place, admin=False)
    else:
        # Check Admin DB - see if item has an entry with same GEOID
        dbid = self.geo_build.geoid_admin_dict.get(alt_geoid)
        if dbid is not None:
            self.search.lookup_dbid(self.place.georow_list, dbid, place=self.place, admin=True)

    if not self.place.georow_list:
        # No existing entry for this geoid
        return

    # Copy the columns up to and including SDX of the first row into a list so that
    # the name and soundex can be modified
    update = list(self.place.georow_list[0][0:GeoSearch.Entry.SDX + 1])

    # Make sure this entry has a different name from existing entry
    if update[GeoSearch.Entry.NAME] == alt_name.lower():
        return

    # Update the name (and soundex) in the new row with the alternate name
    self.geo_build.update_geo_row_name(geo_row=update, name=alt_name)
    new_row = tuple(update)  # Convert back to tuple

    feature = update[GeoSearch.Entry.FEAT]
    if 'ADM1' not in feature and 'ADM2' not in feature:
        # Add to main DB unless the feature is ADM1/ADM2
        self.geo_build.insert(geo_tuple=new_row, feat_code=feature)
        self.count += 1

    # Add name to altnames table
    self.geo_build.insert_alternate_name(alt_name, alt_geoid, alt_lang)
    self.count += 1