def run_lookup(self, title: str, entry: str):
    """
    Run a single lookup for a test case and report what was found.

    #Args:
        title: Label of the test case, printed for traceability
        entry: Place text handed to find_best_match()

    #Returns:
        Tuple of (latitude as float, result text). The result text is the capitalized prefix,
        prefix commas and long name, or 'NO MATCH' when there are no result rows and
        find_best_match() returned a falsy value.
    """
    print("*****TEST: {}".format(title))
    match = TestGeodata.geodata.find_best_match(entry, self.place)

    lat = self.place.lat
    if len(self.place.georow_list) > 0:
        # Several rows may have matched - keep only the first one and take its latitude
        lat = self.place.georow_list[0][GeoUtil.Entry.LAT]
        self.place.georow_list = self.place.georow_list[:1]
        self.place.set_place_type()
    elif not match:
        return float(lat), 'NO MATCH'

    # Shared tail for "rows found" and "match reported without rows"
    nm = f'{self.place.get_long_name(TestGeodata.geodata.geo_build.output_replace_dct)}'
    print(
        f'Found pre=[{self.place.prefix}{self.place.prefix_commas}] Nam=[{nm}]'
    )
    return float(lat), GeoUtil.capwords(
        self.place.prefix) + self.place.prefix_commas + nm
def get_command_line_arguments(self):
    """
    Parse the command line and apply the --logging, --diagnostics and --spellcheck switches.

    Sets self.logger, self.diagnostics and self.enable_spell_checker.
    """
    parser = argparse.ArgumentParser()
    switches = (
        ("--logging", "info - Enable quiet logging"),
        ("--diagnostics",
         "on - Create xx.input.txt and xx.output.txt diagnostics files"),
        ("--spellcheck", "on - Enable spellchecker"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, help=help_text)

    # Read arguments from the command line
    args = parser.parse_args()

    # --logging switch: 'info' selects info logging, anything else selects debug logging
    configure_logging = (GeoUtil.set_info_logging
                         if args.logging == 'info' else
                         GeoUtil.set_debug_logging)
    self.logger = configure_logging('geofinder Init')

    # --diagnostics switch
    diagnostics_on = args.diagnostics == 'on'
    if diagnostics_on:
        self.logger.info(f"--diagnostics files enabled {args.diagnostics}")
    self.diagnostics = diagnostics_on

    # --spellcheck switch
    spellcheck_on = args.spellcheck == 'on'
    if spellcheck_on:
        self.logger.info(f"--spellchecking enabled {args.spellcheck}")
    self.enable_spell_checker = spellcheck_on
def setUpClass(cls):
    """
    Class-level fixture: publish the test cases to TestRunner and build the two
    phrase regex lists on TestRegex.
    """
    super().setUpClass()
    TestRunner.cases = TestRegex.cases

    keep_commas_patterns = no_punc_keep_commas + phrase_cleanup + noise_words
    TestRegex.phrase_rgx_keep_commas = GeoUtil.RegexList(keep_commas_patterns)

    remove_commas_patterns = no_punc_remove_commas + phrase_cleanup + noise_words
    TestRegex.phrase_rgx_remove_commas = GeoUtil.RegexList(
        remove_commas_patterns)
def get_directory_locations(self):
    """
    Work out the working directory and cache directory.

    Sets self.ini_handler, self.directory (as returned by the ini handler) and
    self.cache_dir, then logs the cache directory.
    """
    home_path = str(Path.home())
    default_folder = str(GeoUtil.get_directory_name())

    # Provisional value under the home folder; it is replaced just below by the ini lookup
    self.directory = Path(os.path.join(home_path, default_folder))

    self.ini_handler = IniHandler.IniHandler(base_path=home_path,
                                             ini_name='geofinder.ini')
    self.directory = self.ini_handler.get_directory_from_ini(
        "GeoFinder", GeoUtil.get_directory_name())

    self.cache_dir = GeoUtil.get_cache_directory(self.directory)
    self.logger.info(f'Cache directory {self.cache_dir}')
def __init__(self):
    """
    Build the compiled regex lists that are used for normalization.

    Each list is a GeoUtil.RegexList over a concatenation of module-level pattern
    collections (a no_punc_* collection + phrase_cleanup + noise_words).
    """
    # phrase_rgx_remove_commas - built from no_punc_remove_commas
    remove_commas_patterns = no_punc_remove_commas + phrase_cleanup + noise_words
    self.phrase_rgx_remove_commas = GeoUtil.RegexList(remove_commas_patterns)

    # phrase_rgx_keep_commas - built from no_punc_keep_commas
    keep_commas_patterns = no_punc_keep_commas + phrase_cleanup + noise_words
    self.phrase_rgx_keep_commas = GeoUtil.RegexList(keep_commas_patterns)

    # noise_rgx - used for match scoring; built from the same three collections
    # as phrase_rgx_keep_commas
    noise_patterns = no_punc_keep_commas + phrase_cleanup + noise_words
    self.noise_rgx = GeoUtil.RegexList(noise_patterns)
def __init__(self, directory):
    """
    Remember the base directory and derive the cache directory from it.

    #Args:
        directory: Base directory; the cache directory comes from GeoUtil.get_cache_directory()
    """
    self.logger = logging.getLogger(__name__)

    # Starts out unset; nothing in this method loads it
    self.config_cd: CachedDictionary = None

    self.directory: str = directory
    self.cache_dir = GeoUtil.get_cache_directory(self.directory)
def write_updated_place(self, place: Loc.Loc, original_entry):
    """
    Write out this updated location and lat/lon to ancestry file output.
    If place result_type was DELETE, do not write out location.
    Write to diagnostic files as well if diagnostics are enabled.

    The prefix is taken from the place being written (the `place` argument), so the
    prefix, prefix commas and name in the output all come from the same location.

    #Args:
        place: Updated location
        original_entry: Original file entry
    """
    self.geodata.geo_files.geodb.set_display_names(place)
    place.original_entry = place.get_long_name(
        self.geodata.geo_files.output_replace_dct)
    prefix = GeoUtil.capwords(place.prefix)

    if self.diagnostics:
        self.in_diag_file.write(f'{GeoUtil.capwords(original_entry)}\n')

    if place.result_type != GeoUtil.Result.DELETE:
        # Build the output text once and reuse it for the ancestry file and the diagnostics file
        text = prefix + place.prefix_commas + place.original_entry
        self.ancestry_file_handler.write_updated(text, place)
        self.ancestry_file_handler.write_lat_lon(lat=place.lat, lon=place.lon)
        if self.diagnostics:
            self.out_diag_file.write(text + '\n')
    elif self.diagnostics:
        # Deleted entries are only recorded in the diagnostics output
        self.out_diag_file.write('DELETE\n')
def _calculate_prefix_penalty(prefix): # If the location has a prefix, it is not as good a match prefix_len = len(prefix) if prefix_len > 0: # reduce penalty if prefix is a street (contains digits or 'street' or 'road') penalty = 5 + prefix_len if GeoUtil.is_street(prefix): penalty *= 0.1 else: penalty = 0 return penalty
def __init__(self):
    """
    Set up logging, make sure the main and cache folders exist, load the configuration
    and build the application window.
    """
    self.logger = logging.getLogger(__name__)
    log_format = "%(levelname)s %(asctime)s %(name)s.%(funcName)s %(lineno)d: %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    self.logger.info('Configuration')

    home_folder = str(Path.home())
    self.directory: str = os.path.join(home_folder, GeoUtil.get_directory_name())
    self.cache_dir = GeoUtil.get_cache_directory(self.directory)

    # Configuration settings are stored in the config pickle file (read further below,
    # after the folders are known to exist)
    self.cfg: CachedDictionary.CachedDictionary = CachedDictionary.CachedDictionary(
        self.cache_dir, "config.pkl")

    if not os.path.exists(self.directory):
        self.logger.info(f'Creating main folder {self.directory}')
        os.makedirs(self.directory)

    if not os.path.exists(self.cache_dir):
        self.logger.info(f'Creating cache folder {self.cache_dir}')
        os.makedirs(self.cache_dir)

    self.cfg.read()

    # Verify config
    self.get_config()

    # Create App window
    window = Tk()
    self.root = window
    window["padx"] = 30
    window["pady"] = 30
    window.title('GeoUtil')

    UtilLayout.UtilLayout(root=self.root,
                          directory=self.directory,
                          cache_dir=self.cache_dir)
def fast_prefix(pref: str, result: str) -> str:
    """
    Cleanup prefix by delegating to Loc.prefix_cleanup().

    #Args:
        pref: Prefix text to clean up
        result: Match result text that the prefix is cleaned against

    #Returns:
        Whatever Loc.prefix_cleanup(pref, result) returns
    """
    # The statements that used to follow this return could never execute and were removed
    return Loc.prefix_cleanup(pref, result)
def __init__(self, directory: str, filename: str, progress_bar, prefix,
             geo_build: GeodataBuild, lang_list):
    """
    Read in geonames alternate names file and add to geodata database in alt_names table

    # Args:
        directory: base directory for alternate names file
        filename: filename of geonames alternate_namesV2.txt file
        progress_bar: tkhelper progress bar or None
        prefix: passed through to the base class initializer
        geo_build: GeodataBuild instance
        lang_list: List of ISO languages we want to support, e.g. ['fr', 'es']
    """
    super().__init__(directory, filename, progress_bar, prefix=prefix)

    # Files live in the cache directory derived from the base directory
    self.sub_dir = GeoUtil.get_cache_directory(directory)

    self.geo_build: GeodataBuild.GeodataBuild = geo_build
    self.lang_list = lang_list

    # Scratch location object and search handle (search is not set up here)
    self.place = Loc.Loc()
    self.search = None
def display_georow_list(self, place: Loc.Loc):
    """
    Display list of matches in listbox (tree).

    Each row of place.georow_list is copied into a scratch copy of the place to get its
    display name. Rows whose location fails the event-year check are listed with
    "VERIFY DATE" in place of the prefix.

    #Args:
        place: Location whose georow_list is displayed
    """
    # Clear listbox
    self.w.tree.clear_display_list()

    # Work on a shallow copy so the caller's place is not overwritten by each row
    scratch = copy.copy(place)
    geodb = self.geodata.geo_files.geodb

    for row in place.georow_list:
        geodb.copy_georow_to_place(row, scratch)
        scratch.set_place_type()
        geodb.set_display_names(scratch)
        display_name = scratch.get_long_name(
            self.geodata.geo_files.output_replace_dct)

        # See if name existed at time of event
        name_valid_for_year = self.geodata._valid_year_for_location(
            event_year=place.event_year,
            country_iso=scratch.country_iso,
            admin1=scratch.admin1_id,
            pad_years=0)

        if name_valid_for_year:
            second_column = GeoUtil.capwords(row[GeoUtil.Entry.PREFIX])
        else:
            second_column = "VERIFY DATE"

        self.w.tree.list_insert(display_name, second_column,
                                row[GeoUtil.Entry.ID],
                                f'{int(row[GeoUtil.Entry.SCORE]):d}',
                                row[GeoUtil.Entry.FEAT])

    self.w.root.update_idletasks()
def parse_place(self, place_name: str, geo_db: GeoDB.GeoDB):
    """
    Given a comma separated place name, parse into its city, admin1, country and
    type of entity (city, country etc)

    The name is split on commas (open bracket and open paren are treated as commas).
    The right-most token is looked up as country and the token before it as admin1;
    a dummy '_' token stands in for whichever of the two was not recognized. The
    remaining left-hand tokens become city / admin2 and are also copied to the prefix.

    If the name contains '--', only get_filter_parameters() is called and the method
    returns without any of the parsing above.

    #Args:
        place_name: The place name to parse
        geo_db: GeoDB instance; its `s` member is used for the country and admin1 lookups

    #Returns:
        None. Fields in Loc (city, admin1, admin2, country, prefix, place_type) are updated
        based on parsing. self.result_type is set to a GeoUtil.Result code when the country
        lookup succeeds or fails and when the admin1 lookup succeeds.
    """
    self.geo_db = geo_db
    self.logger.debug(f'PARSE {place_name}\n')
    self.clear()
    self.original_entry = place_name

    # Convert open-brace and open-paren to comma. close brace/paren will be stripped by normalize()
    name = re.sub(r'\[', ',', place_name)
    name = re.sub(r'\(', ',', name)

    tokens = name.split(",")
    if len(tokens[-1]) == 0:
        # Last item is blank, so remove it
        tokens = tokens[:-1]

    token_count = len(tokens)
    self.place_type = PlaceType.CITY

    # First, try to parse and validate State/Province, and Country from last two tokens
    # If one other token, parse as city
    # If two other tokens, parse as city, admin2
    # First two tokens are also copied to prefix.
    # Place type set in this method is one of City, Admin1 or Country
    # If '--' in name, then extract advanced search options
    if '--' in place_name:
        # Advanced Search - Pull out filter flags if present
        self.logger.debug('filter')
        self.get_filter_parameters(place_name)
        return

    if token_count > 0:
        # COUNTRY - right-most token should be country
        self.country_name = self.norm.normalize(tokens[-1], False)

        # Validate country
        self.country_iso = geo_db.s.get_country_iso(
            self.country_name)  # Get Country country_iso
        self.logger.debug(
            f'1) Lookup COUNTRY [{self.country_name}] Found ISO [{self.country_iso}] *******'
        )

        if self.country_iso != '':
            self.place_type = PlaceType.COUNTRY
            self.result_type = GeoUtil.Result.PARTIAL_MATCH
        else:
            # Last token is not COUNTRY.
            # Append dummy token so we now have <tokens>, x
            tokens.append('_')
            token_count = len(tokens)
            self.result_type = GeoUtil.Result.NO_COUNTRY
            self.country_name = ''
        # self.logger.debug(f'ISO =[{self.country_iso}]')

    if token_count > 1:
        # See if 2nd to last token is Admin1
        val = tokens[-2]
        self.logger.debug(f'Get ADM1 from tkn-2 [{val}]')
        self.admin1_name = self.norm.admin1_normalize(
            val, self.country_iso)

        if len(self.admin1_name) > 0:
            # Lookup Admin1
            self.logger.debug(
                f'2) Find ADMIN1 [{self.admin1_name}] *******')
            row_list = []
            self.admin1_id = geo_db.s.get_admin1_id(
                self.admin1_name, self.country_iso)
            if self.admin1_id != '':
                # Found Admin1
                self.place_type = PlaceType.ADMIN1
                # row_list is still the empty list created above, so this resets georow_list
                self.georow_list = row_list
                self.admin1_name = geo_db.s.get_admin1_name(
                    self.admin1_id, self.country_iso)
                # self.logger.debug(f'adm1 nm=[{self.admin1_name}]\nGet ISO')
                self.logger.debug(
                    f'2) Find iso for admin1 id [{self.admin1_id}] *******'
                )
                self.country_iso = geo_db.s.get_iso_from_admin1_id(
                    self.admin1_id, self.country_iso)
                self.result_type = GeoUtil.Result.PARTIAL_MATCH

                # Get country if blank
                row_list = []  # not read again in this branch
                if self.country_name == '':
                    self.country_name = geo_db.s.get_country_name(
                        self.country_iso)
            else:
                # 2nd to last token is not Admin1 - insert dummy token so we have <tokens>, admin1, country
                self.admin1_name = ''
                # Add dummy token for admin1 position
                tokens.insert(-1, '_')
                # token_count = len(tokens)
        else:
            # Normalized admin1 text is empty - overwrite the token with the dummy value
            tokens[-2] = '_'

    # Last two tokens are now Admin1, Country (although they may have dummy value '_')
    # If >2 tokens: Put first non-blank token in City and in Prefix
    # If >3 tokens: Put second non-blank token in Admin2 and also append to Prefix

    # Remove all blank tokens
    tokens = [x for x in tokens if x]
    token_count = len(tokens)

    if token_count >= 3:
        # Possible Formats: City, Admin1, Country or Admin2, Admin1, Country
        # Take first tkn as city
        self.city = self.norm.normalize(tokens[0], False)
        self.place_type = PlaceType.CITY

        # Also place token[0] into Prefix
        if '*' not in tokens[0]:
            self.prefix = str(tokens[0].strip(' '))

    if token_count >= 4:
        # Admin2 is 2nd. Note - if Admin2 isnt found, it will look it up as city
        if GeoUtil.is_street(tokens[-4].lower()):
            # Format: Prefix, City, Admin1, Country
            self.city = self.norm.normalize(tokens[-3], False)
        else:
            # Format: City, Admin2, Admin1, Country
            self.admin2_name = self.norm.normalize(tokens[-3], False)
            self.city = self.norm.normalize(tokens[-4], False)

        self.place_type = PlaceType.CITY

        # put token[0] and token[1] into Prefix
        if '*' not in tokens[1]:
            self.prefix = str(tokens[0].strip(' ')) + ' ' + str(
                tokens[1].strip(' '))

    self.prefix = self.norm.normalize(self.prefix, False)
    row_list = []  # not read after this point

    # fill in country name if still missing - finding Admin1 will find country ISO
    if self.country_name == '' and self.country_iso != '':
        self.country_name = geo_db.s.get_country_name(self.country_iso)

    self.logger.debug(
        f" ======= PARSED: {place_name} \nCity [{self.city}] Adm2 [{self.admin2_name}]"
        f" Adm1 [{self.admin1_name}] adm1_id [{self.admin1_id}] Cntry [{self.country_name}] Pref=[{self.prefix}]"
        f" type_id={self.place_type}\n")
    return
def remove_matches(out, inp):
    """
    Run GeoUtil.remove_matching_sequences() on the two texts with a minimum length of 2.

    #Args:
        out: First text (passed as text1)
        inp: Second text (passed as text2)

    #Returns:
        Tuple of the two texts as returned by GeoUtil.remove_matching_sequences()
    """
    remaining_out, remaining_inp = GeoUtil.remove_matching_sequences(
        text1=out, text2=inp, min_len=2)
    return remaining_out, remaining_inp
def __init__(self, directory: str, display_progress, show_message: bool,
             exit_on_error: bool, languages_list_dct: {},
             feature_code_list_dct: {}, supported_countries_dct: {},
             volume=''):
    """
    Read in datafiles needed for geodata, filter them and create a sql db.

    Filter dictionary examples:
        languages_list_dct={'fr','de'}
        feature_code_list_dct={'PPL', 'ADM1', 'CSTL'}
        supported_countries_dct = {'us','gb','at'}

    # Args:
        directory: base directory
        display_progress: None or Handler called with percent_done:int, msg:str
        show_message: True to show message boxes to user on errors
        exit_on_error: True to exit on serious errors
        languages_list_dct: dictionary containing the ISO-2 languages to load from alternateNames
        feature_code_list_dct: dictionary containing the Geonames.org feature codes to load
        supported_countries_dct: dictionary containing the ISO-2 countries to load
        volume: disk volume to use - e.g. C: for Windows or /Volumes/xyz for OSX, /media/xyz for linux
    """
    self.logger = logging.getLogger(__name__)
    self.geodb: [GeoDB.GeoDB, None] = None
    self.show_message = show_message

    # Both dictionaries: Key is GEOID, Value is DB ID for entry
    self.geoid_main_dict = {}
    self.geoid_admin_dict = {}

    # TODO fix volume handling
    self.volume = volume
    self.collate = 'COLLATE NOCASE'
    self.exit_on_error = exit_on_error

    self.required_db_version = 4
    # Message to user upgrading from earlier DB version
    self.db_upgrade_text = 'Renamed column to Feature'

    self.directory: str = directory
    self.progress_bar = display_progress
    self.line_num = 0
    self.cache_changed: bool = False
    sub_dir = GeoUtil.get_cache_directory(self.directory)
    self.country = None

    self.languages_list_dct = languages_list_dct
    self.feature_code_list_dct = feature_code_list_dct
    self.supported_countries_dct = supported_countries_dct
    self.norm = Normalize.Normalize()
    # Language codes as a plain list, in the iteration order of the filter
    self.lang_list = list(self.languages_list_dct)

    if volume != '':
        os.chdir(volume)

    if not os.path.exists(sub_dir):
        self.logger.warning(f'Directory] {sub_dir} NOT FOUND')
        if self.show_message:
            messagebox.showwarning(
                'Folder not found',
                f'Directory\n\n {sub_dir}\n\n NOT FOUND')
        if exit_on_error:
            sys.exit()

    # Read in Text Replacement dictionary pickle - this has output text replacements
    self.output_replace_cd = CachedDictionary.CachedDictionary(
        sub_dir, "output_list.pkl")
    self.output_replace_cd.read()
    self.output_replace_dct: Dict[str, str] = self.output_replace_cd.dict
    self.output_replace_list = list(self.output_replace_dct)

    self.entry_place = Loc.Loc()

    # Support for Geonames AlternateNames file. Adds alternate names for entries
    self.alternate_names = AlternateNames.AlternateNames(
        directory=self.directory,
        geo_build=self,
        progress_bar=self.progress_bar,
        prefix="Step 3 of 4) ",
        filename='alternateNamesV2.txt',
        lang_list=self.lang_list)
def __init__(self, frame, title, dir_name, cache_filename, error):
    """
    Build the frame: load the cached list and the supported-countries list, create the
    widgets, then configure them and display the data.

    #Args:
        frame: Parent frame the widgets are created in
        title: Text shown in the title label
        dir_name: Base directory; the cache directory is derived from it
        cache_filename: Name of the cache file holding this frame's list
        error: Stored on self.error
    """
    self.logger = logging.getLogger(__name__)
    self.file_error = True
    self.title = title
    self.frame = frame
    self.separator = ":"
    self.dirty_flag = False  # Flag to track if data was modified
    self.error = error

    # Load in list from cache file
    self.directory = dir_name
    self.cache_dir = GeoUtil.get_cache_directory(dir_name)
    self.logger.debug(
        f'SetupStatusFrame dir={dir_name} sub_dir={self.cache_dir} file={cache_filename}'
    )
    self.cache = CachedDictionary.CachedDictionary(self.cache_dir,
                                                   cache_filename)
    self.cache.read()

    self.error_dict = {}  # Keep a dictionary of errors

    countries_cache = CachedDictionary.CachedDictionary(
        self.cache_dir, "country_list.pkl")
    self.supported_countries_cd = countries_cache
    countries_cache.read()
    self.supported_countries_dct: Dict[str, str] = countries_cache.dict
    self.logger.debug(
        f'country list len={len(self.supported_countries_dct)}')

    # Layout table, keyed by widget name
    self.grd = {
        "title_label": [0, 0, 5, 5, "W"],
        "scrollbar": [1, 2, 0, 5, "WNS"],
        "status": [0, 1, 5, 5, "W"],
        "add_button": [2, 4, 5, 5, "W"],
        "listbox": [0, 2, 5, 5, "E"],
        "unused": [2, 3, 5, 5, "W"],
        "add_entry": [0, 4, 5, 5, "W"],
        "load_button": [2, 1, 5, 5, "W"],
        "geoname_button": [2, 1, 5, 5, "E"],
        "add_label": [0, 3, 5, 5, "EW"]
    }

    self.title_label = Widge.CLabel(frame,
                                    text=self.title,
                                    width=80,
                                    style='Info.TLabel')
    self.status = Widge.CLabel(frame,
                               text=" ",
                               width=80,
                               style='Highlight.TLabel')
    self.scrollbar = Scrollbar(frame)
    self.listbox = Listbox(frame,
                           width=80,
                           height=20,
                           bg=AppStyle.LT_GRAY,
                           selectmode=MULTIPLE,
                           yscrollcommand=self.scrollbar.set)
    self.add_button = ttk.Button(frame,
                                 text="geonames.org",
                                 command=self.web_handler,
                                 width=12)

    # Configure buttons and widgets
    self.configure_widgets()

    # Display data
    self.load_handler()
def update_names(self, dct):
    """
    Rebuild self.updated_entry from the cleaned-up prefix and the long name.

    #Args:
        dct: Passed to get_long_name()
    """
    self.logger.debug(f'pref bef=[{self.prefix}]')

    cleaned_prefix = self.prefix_cleanup(self.prefix, self.get_long_name(dct))
    capitalized_prefix = GeoUtil.capwords(cleaned_prefix)
    self.updated_entry = capitalized_prefix + self.get_long_name(dct)

    self.logger.debug(f'updated_entry=[{self.updated_entry}]')
def open_geodb(self, repair_database: bool, query_limit: int) -> bool:
    """
    Open Geoname DB file - this is the db of geoname.org city files and is stored in cache directory
    under geonames_data. The db only contains important fields and only for supported countries.
    If the db doesn't exist and repair flag is True, read the geonames.org files and build DB.
    The DB has a version table for the schema version. If the schema changes, the version should
    be updated. This will check DB schema version and rebuild DB if version is out of date.

    When an existing DB has the wrong version, the DB file is deleted and the program exits
    via sys.exit(); the rebuild then happens on the next start.

    # Args:
        repair_database: If True, rebuild database if error or missing
        query_limit: Passed to GeoDB as db_limit

    # Returns:
        The result of create_geonames_database() when the database is rebuilt,
        otherwise False
    """
    # Use db if it exists and has data and is correct version
    cache_dir = GeoUtil.get_cache_directory(self.directory)
    db_path = os.path.join(cache_dir, 'geodata.db')
    self.logger.debug(f'path for geodata.db: {db_path}')
    # Empty err_msg means no problem was detected
    err_msg = ''

    # Validate Database setup
    if os.path.exists(db_path):
        # DB was Found
        self.logger.debug(f'DB found at {db_path}')
        self.geodb = GeoDB.GeoDB(db_path=db_path,
                                 show_message=self.show_message,
                                 exit_on_error=self.exit_on_error,
                                 set_speed_pragmas=True,
                                 db_limit=query_limit)

        # Make sure DB is correct version
        ver = self.geodb.get_db_version()
        if ver != self.required_db_version:
            # Bad DB version
            if ver == DB_REBUILDING:
                # DB didn't complete rebuild
                err_msg = f'Database only partially built. Deleting and will rebuild on next startup'
            else:
                # DB is out of date
                err_msg = f'Database version will be upgraded:\n\n{self.db_upgrade_text}\n\n' \
                          f'Upgrading database from V {ver} to V {self.required_db_version}.'

            # Delete the stale DB file, tell the user, then exit
            self.geodb.close()
            os.remove(db_path)
            self.logger.info(err_msg)
            if self.show_message:
                messagebox.showinfo(
                    'Database Deleted. Will rebuild on start up', err_msg)
            sys.exit()
    else:
        err_msg = f'Database not found at\n\n{db_path}.\n\nBuilding DB'

    self.logger.debug(f'{err_msg}')

    if err_msg == '':
        pass  # No DB errors detected
        #count = self.geodb.get_row_count()
        #self.logger.info(f'Geoname database has {count:,} entries\n'
        #                 f'------------------------------------------------------------\n')
    else:
        # DB error detected - rebuild database if flag set
        if self.show_message:
            messagebox.showinfo('Database Error', err_msg)
        self.logger.debug(err_msg)

        if repair_database:
            if os.path.exists(db_path):
                self.geodb.close()
                os.remove(db_path)
                self.logger.info('Database deleted')
                if self.show_message:
                    messagebox.showinfo(
                        'Database Deleted. Will rebuild on start up', err_msg)

            # Open a fresh DB at the same path and build it from the geonames files
            self.geodb = GeoDB.GeoDB(db_path=db_path,
                                     show_message=self.show_message,
                                     exit_on_error=self.exit_on_error,
                                     set_speed_pragmas=True,
                                     db_limit=query_limit)
            return self.create_geonames_database()

    return False