def __init__(self): """ Init """ self.logger = logging.getLogger(__name__) self.original_entry: str = "" self.lat: float = float('NaN') # Latitude self.lon: float = float('NaN') # Longitude self.country_iso: str = "" # Country ISO code self.country_name: str = '' self.city: str = "" # City or entity name self.admin1_name: str = "" # Admin1 (State/province/etc) self.admin1_id: str = "" # Admin1 Geoname ID self.admin2_name: str = "" # Admin2 (county) self.admin2_id = "" # Admin2 Geoname ID self.prefix: str = "" # Prefix (entries prepended before geoname location) self.feature: str = '' # Geoname feature code self.place_type: int = PlaceType.COUNTRY # Is this a Country , Admin1 ,admin2 or city? self.geoid: str = '' # Geoname GEOID self.enclosed_by = '' # The entity that encloses this. E.g United States encloses Texas self.updated_entry = '' self.score = 100.0 self.norm = Normalize.Normalize() # Lookup result info self.status: str = "" self.status_detail: str = "" self.result_type: int = GeoUtil.Result.NO_MATCH # Result type of lookup self.result_type_text: str = '' # Text version of result type self.georow_list: List = [] # List of items that matched this location self.event_year: int = 0 self.geo_db = None
def update_global_replacement_list(self, key, geoid, prefix):
    res = ReplacementDictionary.build_replacement_entry(geoid, prefix)
    ky = Normalize.normalize(text=key, remove_commas=False)
    self.global_replace.set(ky, res)

    # Periodically flush dictionary to disk
    if self.update_counter % 10 == 1:
        self.global_replace.write()
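
# Illustrative call (a sketch; the key and geoid values are hypothetical - in
# process_place_entries the key is the normalized town entry and the geoid comes from
# the matched place):
#
#   self.update_global_replacement_list(key='paris, france', geoid='1234567', prefix='')
#
# Note the dictionary is only written to disk when an update coincides with
# update_counter % 10 == 1, trading a little durability for fewer disk writes.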
def __init__(self, geodb):
    self.logger = logging.getLogger(__name__)
    self.detailed_debug = True
    self.start = 0
    self.use_wildcards = True
    self.total_lookups = 0
    self.cache = {}
    self.place_type = ''
    self.select_str = 'name, country, admin1_id, admin2_id, lat, lon, feature, geoid, sdx'
    self.geodb = geodb
    self.match = MatchScore.MatchScore()
    self.norm = Normalize.Normalize()
    self.place = Loc.Loc()
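
# Illustrative only: select_str above lists the columns a lookup returns.  A query is
# assumed to be built roughly as below (the table name main.geodata and the WHERE clause
# mirror the sanity test in GeoDB.__init__; the actual query builder is not shown here):
#
#   query = f'SELECT {self.select_str} FROM main.geodata WHERE name = ? AND country = ?'
#   args = ('paris', 'fr')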
def get_soundex(text) -> str:
    """
    Returns: Phonetics Double Metaphone Soundex code for sorted words in text.
        Words are alpha sorted but stop words are forced to the end.
        First two actual letters of each word are prepended.
    """
    sdx = []
    word_list = Normalize.sorted_normalize(text)
    for word in word_list:
        sdx.append(get_word_soundex(word))
    res = ' '.join(sdx)
    if len(res) == 0:
        res = text
    return res.lower()
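
# A minimal sketch of what a per-word helper like get_word_soundex could look like,
# assuming the PyPI 'Metaphone' package and the docstring's rule of prepending the first
# two actual letters of the word; the real helper is not shown in this excerpt and may differ:
#
#   from metaphone import doublemetaphone
#
#   def get_word_soundex(word: str) -> str:
#       primary, _secondary = doublemetaphone(word)
#       return word[:2] + primary     # e.g. 'london' -> 'lo' + its metaphone code
#
# get_soundex() then joins the per-word codes with spaces and lower-cases the result.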
def __init__(self):
    self.logger = logging.getLogger(__name__)
    self.score_diags = ''       # Diagnostic text for scoring
    self.token_weight = []
    self.prefix_weight = 0.0
    self.feature_weight = 0.0
    self.input_weight = 0.0

    # Weighting for each input term match - adm2, adm1, country
    token_weights = [.2, .3, .5]
    self.set_weighting(token_weight=token_weights, prefix_weight=6.0, feature_weight=0.15)  # Weighting for each part of score

    self.wildcard_penalty = 8.0
    self.norm = Normalize.Normalize()
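
# Illustrative only: the actual scoring method is not shown in this excerpt, but the
# weights above are assumed to scale per-term penalties (ordered adm2, adm1, country
# per the comment) into one composite value, roughly:
#
#   penalties = [adm2_penalty, adm1_penalty, country_penalty]   # hypothetical inputs
#   score = (sum(w * p for w, p in zip(self.token_weight, penalties))
#            + self.prefix_weight * prefix_penalty
#            + self.feature_weight * feature_penalty)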
def __init__(self, db_path, show_message: bool, exit_on_error: bool, set_speed_pragmas: bool, db_limit: int):
    """
    geoname data database init.  Open database if present, otherwise raise error
    # Args:
        db_path: full path to database file
        show_message: If True, show messagebox to user on error
        exit_on_error: If True, exit if significant error occurs
        set_speed_pragmas: If True, set DB pragmas for maximum performance
        db_limit: SQL LIMIT parameter
    # Raises:
        ValueError('Cannot open database'), ValueError('Database empty or corrupt')
    """
    self.logger = logging.getLogger(__name__)
    self.show_message = show_message
    self.exit_on_error = exit_on_error
    self.max_query_results = 50
    self.total_time = 0
    self.total_lookups = 0
    self.slow_lookup = 0
    self.match = MatchScore.MatchScore()
    self.norm = Normalize.Normalize()
    # self.select_str = 'name, country, admin1_id, admin2_id, lat, lon, feature, geoid, sdx'
    self.db_path = db_path

    # See if DB exists
    if os.path.exists(db_path):
        db_existed = True
    else:
        db_existed = False

    self.db = DB.DB(db_filename=db_path, show_message=show_message, exit_on_error=exit_on_error)
    if self.db.err != '':
        self.logger.error(f"Error! cannot open database {db_path}.")
        raise ValueError('Cannot open database')

    # If DB was initially found
    if db_existed:
        # Run sanity test on DB
        err = self.db.test_database('name', 'main.geodata', where='name = ? AND country = ?', args=('ba', 'fr'))

        if err:
            # DB failed sanity test
            self.logger.warning(f'DB error for {db_path}')

            if show_message:
                if messagebox.askyesno('Error',
                                       f'Geoname database is empty or corrupt:\n\n {db_path} \n\nDo you want to delete it and rebuild?'):
                    messagebox.showinfo('', 'Deleting Geoname database')
                    self.db.conn.close()
                    os.remove(db_path)
            if exit_on_error:
                sys.exit()
            else:
                raise ValueError('Database empty or corrupt')

    if set_speed_pragmas:
        self.db.set_speed_pragmas()
    self.db_limit = db_limit
    self.db.order_string = ''
    self.db.limit_string = f'LIMIT {self.db_limit}'
    self.place_type = ''
    self.s: GeoSearch.GeoSearch = GeoSearch.GeoSearch(geodb=self)
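
# Illustrative construction (a sketch; the path and limit values are hypothetical, the
# keyword arguments follow the signature documented above):
#
#   geo_db = GeoDB.GeoDB(db_path='/path/to/geodata.db', show_message=True,
#                        exit_on_error=False, set_speed_pragmas=True, db_limit=105)
#
# With set_speed_pragmas=True the DB helper's performance pragmas are applied, and
# db_limit caps result rows via the 'LIMIT {db_limit}' string set above.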
def __init__(self, directory: str, display_progress, show_message: bool, exit_on_error: bool,
             languages_list_dct: {}, feature_code_list_dct: {}, supported_countries_dct: {}, volume=''):
    """
    Read in datafiles needed for geodata, filter them and create a SQL db.
    Filter dictionary examples:
        languages_list_dct={'fr','de'}
        feature_code_list_dct={'PPL', 'ADM1', 'CSTL'}
        supported_countries_dct={'us','gb','at'}
    # Args:
        directory: base directory
        display_progress: None or Handler called with percent_done:int, msg:str
        show_message: True to show message boxes to user on errors
        exit_on_error: True to exit on serious errors
        languages_list_dct: dictionary containing the ISO-2 languages to load from alternateNames
        feature_code_list_dct: dictionary containing the Geonames.org feature codes to load
        supported_countries_dct: dictionary containing the ISO-2 countries to load
        volume: disk volume to use - e.g. C: for Windows, /Volumes/xyz for OSX, /media/xyz for Linux
    """
    self.logger = logging.getLogger(__name__)
    self.geodb: [GeoDB.GeoDB, None] = None
    self.show_message = show_message
    self.geoid_main_dict = {}   # Key is GEOID, Value is DB ID for entry
    self.geoid_admin_dict = {}  # Key is GEOID, Value is DB ID for entry
    # TODO fix volume handling
    self.volume = volume
    self.collate = 'COLLATE NOCASE'
    self.exit_on_error = exit_on_error
    self.required_db_version = 4
    # Message to user upgrading from earlier DB version
    self.db_upgrade_text = 'Renamed column to Feature'
    self.directory: str = directory
    self.progress_bar = display_progress
    self.line_num = 0
    self.cache_changed: bool = False
    sub_dir = GeoUtil.get_cache_directory(self.directory)
    self.country = None
    self.languages_list_dct = languages_list_dct
    self.feature_code_list_dct = feature_code_list_dct
    self.supported_countries_dct = supported_countries_dct
    self.lang_list = []
    self.norm = Normalize.Normalize()

    for item in self.languages_list_dct:
        self.lang_list.append(item)

    if volume != '':
        os.chdir(volume)
    if not os.path.exists(sub_dir):
        self.logger.warning(f'Directory {sub_dir} NOT FOUND')
        if self.show_message:
            messagebox.showwarning('Folder not found', f'Directory\n\n {sub_dir}\n\n NOT FOUND')
        if exit_on_error:
            sys.exit()

    # Read in Text Replacement dictionary pickle - this has output text replacements
    self.output_replace_cd = CachedDictionary.CachedDictionary(sub_dir, "output_list.pkl")
    self.output_replace_cd.read()
    self.output_replace_dct: Dict[str, str] = self.output_replace_cd.dict
    self.output_replace_list = []

    for item in self.output_replace_dct:
        self.output_replace_list.append(item)

    self.entry_place = Loc.Loc()

    # Support for Geonames AlternateNames file.  Adds alternate names for entries
    self.alternate_names = AlternateNames.AlternateNames(directory=self.directory, geo_build=self,
                                                         progress_bar=self.progress_bar, prefix="Step 3 of 4) ",
                                                         filename='alternateNamesV2.txt', lang_list=self.lang_list)
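
# Illustrative construction using the filter-dictionary examples from the docstring above
# (the enclosing class name is not shown in this excerpt, so GeodataBuild is assumed;
# display_progress=None suppresses progress callbacks as documented):
#
#   builder = GeodataBuild(directory='/path/to/geoname_data', display_progress=None,
#                          show_message=True, exit_on_error=False,
#                          languages_list_dct={'fr': '', 'de': ''},
#                          feature_code_list_dct={'PPL': '', 'ADM1': '', 'CSTL': ''},
#                          supported_countries_dct={'us': '', 'gb': '', 'at': ''})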
def process_place_entries(self):
    """ Handle PLACE entries in user's file.  Replace it, skip it, or have user correct it. """
    self.w.original_entry.text = ""

    if self.w.prog.shutdown_requested:
        self.periodic_update("Shutting down...")
    else:
        self.periodic_update("Scanning")
    self.clear_result_list(self.place)

    while True:
        # Keep reading place entries until we need user review or reach End Of File
        self.update_counter += 1  # Counter is used to periodically update
        # Update statistics
        self.update_statistics()

        # Find the next PLACE entry in file
        # Process it and keep looping until we need user review
        self.place.clear()
        town_entry, eof, rec_id = self.ancestry_file_handler.get_next_place()
        self.place.updated_entry = town_entry
        self.place.id = rec_id
        town_entry = Normalize.normalize(text=town_entry, remove_commas=False)

        if eof:
            self.end_of_file_shutdown()

        # See if we already have a fix (Global Replace) or Skip (ignore).
        # Otherwise see if we can find it or have user handle it
        replacement_geoid = self.get_replacement(self.global_replace, town_entry, self.place)
        if replacement_geoid is not None:
            # There is already a global change that we can apply to this entry.
            self.matched_count += 1

            if self.place.result_type == GeoUtil.Result.STRONG_MATCH:
                # REPLACE - Output the updated place to ancestry file
                self.write_updated_place(self.place, town_entry)

                # Display status to user
                if self.w.prog.shutdown_requested:
                    self.periodic_update("Creating Import...")
                else:
                    self.periodic_update("Applying change")
            elif self.place.result_type == GeoUtil.Result.DELETE:
                # DELETE - Don't write out this place
                continue
            else:
                # ERROR - We previously found an update, but the GEOID for replacement can no longer be found
                self.logger.warning(f'***ERROR looking up GEOID=[{replacement_geoid}] for [{town_entry}] ')
                self.place.event_year = int(self.ancestry_file_handler.event_year)  # Set place date to event date (geo names change over time)
                self.w.original_entry.text = f'** DATABASE ERROR FOR GEOID=[{replacement_geoid}] for [{town_entry}]'
                self.w.user_entry.text = f'{town_entry}'
                self.geodata.find_matches(town_entry, self.place, self.w.prog.shutdown_requested)
                break
            continue
        elif self.skiplist.get(town_entry) is not None:
            # SKIP - User marked place as SKIP - Write out as-is and go to next error
            self.skip_count += 1
            self.periodic_update("Skipping")
            self.ancestry_file_handler.write_asis(town_entry)
            continue
        else:
            # FOUND a PLACE entry that we don't already have a global replace or skip for
            # See if it is in the place database
            self.place.event_year = int(self.ancestry_file_handler.event_year)  # Set place date to event date (geo names change over time)

            self.geodata.find_matches(town_entry, self.place, self.w.prog.shutdown_requested)

            if self.place.result_type in GeoUtil.successful_match:
                # STRONG MATCH
                if self.place.result_type == GeoUtil.Result.STRONG_MATCH:
                    # FOUND STRONG MATCH - no user verification needed
                    self.matched_count += 1

                    # Write out line without user verification
                    if self.w.prog.shutdown_requested:
                        self.periodic_update("Creating Import...")
                    else:
                        self.periodic_update("Scanning")

                    # Add to global replace list
                    self.update_global_replacement_list(key=town_entry, geoid=self.place.geoid, prefix=self.place.prefix)

                    self.write_updated_place(self.place, town_entry)
                    self.logger.debug(f'Found Strong Match for {town_entry} Setting DICT')
                    continue
                else:
                    # WEAK MATCH OR MULTIPLE MATCHES
                    if self.w.prog.shutdown_requested:
                        # User requested shutdown - so no user interaction.  Write this item out as-is
                        self.review_count += 1
                        self.periodic_update("Creating Import...")
                        self.w.original_entry.text = " "
                        self.ancestry_file_handler.write_asis(town_entry)
                        continue
                    else:
                        # USER REVIEW - Have user review the match
                        self.logger.debug(f'user review for {town_entry} res= [{self.place.result_type}] ')
                        self.w.status.configure(style="Good.TLabel")
                        self.w.original_entry.text = self.place.original_entry  # Display place
                        self.w.user_entry.text = self.place.updated_entry  # Display place
                        # Break out of loop and have user review the match
                        break
            else:
                # NO MATCH FOUND
                if self.w.prog.shutdown_requested:
                    # User requested shutdown.  Write this item out as-is
                    self.review_count += 1
                    self.periodic_update("Creating Import...")
                    self.w.original_entry.text = " "
                    self.ancestry_file_handler.write_asis(town_entry)
                    continue
                else:
                    # USER REVIEW - Have user review entry
                    # self.logger.debug(f'User2 review for {town_entry}. status ={self.place.status}')
                    self.w.status.configure(style="Good.TLabel")
                    self.w.original_entry.text = self.place.original_entry  # Display place
                    self.w.user_entry.text = self.place.original_entry  # Display place
                    # Break out of loop and have user review the item
                    break

    # Have user review the result
    self.display_result(self.place)
def load_data_files(self) -> bool:
    """
    Load in data files required for GeoFinder: global_replace dictionary, Geodata files and geonames
    # Returns:
        Error - True if error occurred
    """
    # Read in Skiplist, Replace list
    self.skiplist = CachedDictionary.CachedDictionary(self.cache_dir, "skiplist.pkl")
    self.skiplist.read()
    self.global_replace = CachedDictionary.CachedDictionary(self.cache_dir, "global_replace.pkl")
    self.global_replace.read()
    dict_copy = copy.copy(self.global_replace.dict)

    # Convert all global_replace keys to normalized (lowercase) form
    for ky in dict_copy:
        val = self.global_replace.dict.pop(ky)
        new_key = Normalize.normalize(text=ky, remove_commas=False)
        self.global_replace.dict[new_key] = val

    # Read in dictionary listing Geoname features we should include
    self.feature_code_list_cd = CachedDictionary.CachedDictionary(self.cache_dir, "feature_list.pkl")
    self.feature_code_list_cd.read()
    feature_code_list_dct: Dict[str, str] = self.feature_code_list_cd.dict
    if len(feature_code_list_dct) < 3:
        self.logger.warning('Feature list is empty.')
        feature_code_list_dct.clear()
        feature_list = UtilFeatureFrame.default
        for feat in feature_list:
            feature_code_list_dct[feat] = ''
        self.feature_code_list_cd.write()

    # Read in dictionary containing countries (ISO-2) we should include
    self.supported_countries_cd = CachedDictionary.CachedDictionary(self.cache_dir, "country_list.pkl")
    self.supported_countries_cd.read()
    supported_countries_dct: Dict[str, str] = self.supported_countries_cd.dict

    # Read in dictionary containing languages (ISO-2) we should include
    self.languages_list_cd = CachedDictionary.CachedDictionary(self.cache_dir, "languages_list.pkl")
    self.languages_list_cd.read()
    languages_list_dct: Dict[str, str] = self.languages_list_cd.dict

    # Initialize geo data
    self.geodata = Geodata(directory_name=self.directory, progress_bar=self.w.prog,
                           enable_spell_checker=self.enable_spell_checker,
                           show_message=True, exit_on_error=True,
                           languages_list_dct=languages_list_dct,
                           feature_code_list_dct=feature_code_list_dct,
                           supported_countries_dct=supported_countries_dct)

    # If the list of supported countries is unusually short, display note to user
    num = self.display_country_note()
    self.logger.info('{} countries will be loaded'.format(num))

    # Open Geoname Gazetteer DB - city names, lat/long, etc.
    error = self.geodata.open()
    if error:
        TKHelper.fatal_error(MISSING_FILES)

    self.w.root.update()
    self.w.prog.update_progress(100, " ")
    return error
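
# Illustrative effect of the key-normalization pass in load_data_files (only lower-casing
# is stated by the comment; the exact behavior of Normalize.normalize is not shown, and
# commas are preserved because remove_commas=False):
#
#   'Paris, France'  ->  'paris, france'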