def __init__(self):
    """Configure logging, locate/create the data folders, load the config
    pickle, and launch the GeoUtil Tk window."""
    self.logger = logging.getLogger(__name__)
    fmt = "%(levelname)s %(asctime)s %(name)s.%(funcName)s %(lineno)d: %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=fmt)
    self.logger.info('Configuration')

    # Main data folder lives under the user's home directory
    self.directory: str = os.path.join(str(Path.home()), GeoKeys.get_directory_name())
    self.cache_dir = GeoKeys.get_cache_directory()

    # Get configuration settings stored in config pickle file
    self.cfg: CachedDictionary.CachedDictionary = CachedDictionary.CachedDictionary(self.cache_dir, "config.pkl")

    # Create the main and cache folders on first run
    if not os.path.exists(self.directory):
        self.logger.info(f'Creating main folder {self.directory}')
        os.makedirs(self.directory)
    if not os.path.exists(self.cache_dir):
        self.logger.info(f'Creating cache folder {self.cache_dir}')
        os.makedirs(self.cache_dir)

    self.cfg.read()

    # Verify config - test to see if gedcom file accessible
    self.get_config()

    # Create App window and hand control to the layout builder
    self.root = Tk()
    self.root["padx"] = 30
    self.root["pady"] = 30
    self.root.title('GeoUtil')
    UtilLayout.UtilLayout(root=self.root, directory=self.directory, cache_dir=self.cache_dir)
def insert_georow(self, geoname_row):
    """Create a geo_row from a geonames.org file row and insert it into the DB.

    Example row layout: ('paris', 'fr', '07', '012', 12.345, 45.123, 'PPL', '34124')

    Populated places ('PP*' feature codes) are re-bucketed by population size so
    large cities rank higher in searches. For USA states (ADM1) a second row is
    inserted under the two-letter state abbreviation.

    :param geoname_row: namedtuple parsed from a geonames.org tab-delimited line
    """
    geo_row = [None] * GeoDB.Entry.MAX
    geo_row[GeoDB.Entry.NAME] = GeoKeys.normalize(geoname_row.name)
    geo_row[GeoDB.Entry.SDX] = GeoKeys.get_soundex(geo_row[GeoDB.Entry.NAME])
    geo_row[GeoDB.Entry.ISO] = geoname_row.iso.lower()
    geo_row[GeoDB.Entry.ADM1] = geoname_row.admin1_id
    geo_row[GeoDB.Entry.ADM2] = geoname_row.admin2_id
    geo_row[GeoDB.Entry.LAT] = geoname_row.lat
    geo_row[GeoDB.Entry.LON] = geoname_row.lon
    geo_row[GeoDB.Entry.FEAT] = geoname_row.feat_code
    geo_row[GeoDB.Entry.ID] = geoname_row.id

    # Re-bucket populated places by size (convert population once, not per branch)
    population = int(geoname_row.pop)
    if 'PP' in geoname_row.feat_code:
        if population > 1000000:
            geo_row[GeoDB.Entry.FEAT] = 'PP1M'
        elif population > 100000:
            geo_row[GeoDB.Entry.FEAT] = 'P1HK'
        elif population < 10000:
            geo_row[GeoDB.Entry.FEAT] = 'PPLL'

    self.geodb.insert(geo_row=geo_row, feat_code=geoname_row.feat_code)

    # Also add abbreviations for USA states (e.g. 'ny' for New York)
    if geo_row[GeoDB.Entry.ISO] == 'us' and geoname_row.feat_code == 'ADM1':
        geo_row[GeoDB.Entry.NAME] = geo_row[GeoDB.Entry.ADM1].lower()
        # NOTE(review): SDX still holds the soundex of the full state name, not of
        # the abbreviation — confirm whether it should be recomputed here.
        self.geodb.insert(geo_row=geo_row, feat_code=geoname_row.feat_code)
def __init__(self, directory_name: str, filename: str, progress_bar, geo_files: GeodataFiles, lang_list):
    """Set up the alternate-names file handler.

    :param directory_name: folder containing the geonames data files
    :param filename: alternate-names file to process
    :param progress_bar: progress widget for status updates
    :param geo_files: owning GeodataFiles instance (provides the geo DB)
    :param lang_list: language codes whose alternate names should be loaded
    """
    super().__init__(directory_name, filename, progress_bar)
    self.geo_files: GeodataFiles.GeodataFiles = geo_files
    self.lang_list = lang_list
    self.sub_dir = GeoKeys.get_cache_directory(directory_name)
    self.loc = Loc()
def get_country_iso(self, place: Loc) -> str:
    """Return the ISO code for the country in place.country_name.

    Performs an exact-name lookup against country-level (ADM0) rows. On a
    unique match, place.country_name is updated to the DB's canonical name.
    Returns '' when no match is found.
    """
    lookup_target, modified = GeoKeys.country_normalize(place.country_name)
    if len(lookup_target) == 0:
        return ''

    # Exact name match against ADM0 (country) entries only
    query_list = [
        Query(where="name = ? AND f_code = ? ",
              args=(lookup_target, 'ADM0'),
              result=Result.STRONG_MATCH),
    ]
    row_list, result_code = self.db.process_query_list(from_tbl='main.admin',
                                                       query_list=query_list)

    if len(row_list) == 0:
        return ''

    # Unique match: adopt the canonical country name from the DB
    if len(row_list) == 1:
        place.country_name = row_list[0][Entry.NAME]
    return row_list[0][Entry.ISO]
def format_full_nm(self, replace_dct):
    """Build the full display name for this place.

    Assembles 'prefix, city, adm2, adm1, country' down from the entity type,
    capitalizes it, and applies any user text replacements.

    :param replace_dct: optional {pattern: replacement} applied via re.sub
    :return: the formatted place name (prefix handled separately via
             self.prefix / self.prefix_commas)
    """
    self.set_place_type()

    # Normalize missing admin names to empty strings (mutates instance state)
    if self.admin1_name is None:
        self.admin1_name = ''
    if self.admin2_name is None:
        self.admin2_name = ''

    # Most-specific entity first, country last
    if self.place_type == PlaceType.COUNTRY:
        parts = [self.country_name]
    elif self.place_type == PlaceType.ADMIN1:
        parts = [self.admin1_name, self.country_name]
    elif self.place_type == PlaceType.ADMIN2:
        parts = [self.admin2_name, self.admin1_name, self.country_name]
    else:
        parts = [self.city1, self.admin2_name, self.admin1_name, str(self.country_name)]
    nm = ', '.join(map(str, parts))

    # Drop the prefix when the name already contains it
    if self.prefix in nm:
        self.prefix = ''
    self.prefix_commas = ', ' if len(self.prefix) > 0 else ''

    nm = GeoKeys.capwords(nm)

    # Perform any text replacements user entered into Output Tab
    if replace_dct:
        for pattern, replacement in replace_dct.items():
            nm = re.sub(pattern, replacement, nm)
    return nm
def filter(self, place_name, geo_files):
    """Parse advanced-search flags ('--feature', '--iso', '--country') out of a
    comma-separated place name and set the corresponding search fields."""
    # Separate out arguments: any comma token containing '--' is treated as a flag
    tokens = place_name.split(",")
    args = []
    for tkn in tokens:
        if '--' in tkn:
            args.append(tkn.strip(' '))

    # Parse options in place name (help text suppressed - internal flags only)
    parser = ArgumentParserNoExit(description="Parses command.")
    parser.add_argument("-f", "--feature", help=argparse.SUPPRESS)
    parser.add_argument("-i", "--iso", help=argparse.SUPPRESS)
    parser.add_argument("-c", "--country", help=argparse.SUPPRESS)
    try:
        options = parser.parse_args(args)
        # First token is the search target (city)
        self.city1 = GeoKeys.search_normalize(tokens[0], self.country_iso)
        self.target = self.city1
        if options.iso:
            self.country_iso = options.iso.lower()
        if options.country:
            # NOTE(review): this stores the lowercased --country value into
            # country_iso, same field as --iso — confirm that is intended.
            self.country_iso = options.country.lower()
        if options.feature:
            self.feature = options.feature.upper()
        self.place_type = PlaceType.ADVANCED_SEARCH
    except Exception as e:
        # ArgumentParserNoExit raises instead of exiting; log and fall through
        self.logger.debug(e)

    self.logger.debug(
        f'ADV SEARCH: targ={self.city1} iso={self.country_iso} feat={self.feature} typ={self.place_type}'
    )
def __init__(self, directory):
    """Hold the data directory and its derived cache directory.

    :param directory: path to the main GeoFinder data folder
    """
    self.logger = loggingging = logging.getLogger(__name__)
    self.directory: str = directory
    self.cache_dir = GeoKeys.get_cache_directory(self.directory)
    # Config dictionary is loaded later; start with no value
    self.config_cd: CachedDictionary = None
def handle_line(self, line_num, row):
    """Process one alternate-names file line: if its language is requested and
    its GEOID already exists in the DB, insert the alias as an extra row."""
    # This is called as each line is read
    alt_tokens = row.split('\t')
    if len(alt_tokens) != 10:
        self.logger.debug(f'Incorrect number of tokens: {alt_tokens} line {line_num}')
        return

    self.loc.georow_list = []

    # Alternate names are in multiple languages. Only add if item is in requested lang list
    if alt_tokens[ALT_LANG] in self.lang_list:
        # Add this alias to geoname db if there is already an entry (geoname DB is filtered based on feature)
        # See if item has an entry with same GEOID in Main DB
        dbid = self.geo_files.geodb.geoid_main_dict.get(alt_tokens[ALT_GEOID])
        if dbid is not None:
            self.loc.target = dbid
            self.geo_files.geodb.lookup_main_dbid(place=self.loc)
        else:
            # See if item has an entry with same GEOID in Admin DB
            dbid = self.geo_files.geodb.geoid_admin_dict.get(alt_tokens[ALT_GEOID])
            if dbid is not None:
                self.loc.target = dbid
                self.geo_files.geodb.lookup_admin_dbid(place=self.loc)

        if len(self.loc.georow_list) > 0:
            # Found the base entry: copy its row, swap in the alias name, and
            # recompute the soundex (last element of the row) for the alias
            lst = list(self.loc.georow_list[0])
            del lst[-1]  # drop trailing element (presumably the old soundex — TODO confirm)
            lst[GeoDB.Entry.NAME] = GeoKeys.normalize(alt_tokens[ALT_NAME])
            lst.append(GeoKeys.get_soundex(alt_tokens[ALT_NAME]))
            new_row = tuple(lst)  # Convert back to tuple
            if alt_tokens[ALT_LANG] != 'en' or 'ADM' not in lst[GeoDB.Entry.FEAT]:
                # Only add if not English or not ADM1/ADM2
                self.geo_files.geodb.insert(geo_row=new_row, feat_code=lst[GeoDB.Entry.FEAT])
                self.count += 1
                # Add name to altnames table
                if alt_tokens[ALT_LANG] != 'en':
                    self.geo_files.geodb.insert_alternate_name(alt_tokens[ALT_NAME],
                                                               alt_tokens[ALT_GEOID],
                                                               alt_tokens[ALT_LANG])
def update_rowlist_prefix(self, place: Loc.Loc):
    """ Set all the prefix values in the georow_list.

    For each result row, any leading tokens of the user's original entry that
    did not match the result's formatted name are collected into the prefix.
    :param place: Loc whose georow_list rows are updated in place
    """
    temp_place = Loc.Loc()
    tokens = place.original_entry.split(',')

    for idx, rw in enumerate(place.georow_list):
        update = list(rw)
        # Put unused fields into prefix
        self.geo_files.geodb.copy_georow_to_place(rw, temp_place)
        temp_place.prefix = ''
        # Normalized full name of this result, used to test which input tokens it covers
        nm = GeoKeys.search_normalize(
            temp_place.format_full_nm(self.geo_files.output_replace_dct),
            place.country_iso)
        # self.logger.debug(f'NAME ={nm}')

        place.prefix = ''
        # Only the first two input tokens are candidates for the prefix
        for num, fld in enumerate(tokens[:2]):
            item = GeoKeys.search_normalize(fld, place.country_iso)
            add_item = False
            # self.logger.debug(f'item={item} ')
            if num == 0 and item not in nm:
                add_item = True
            if num == 1 and item not in nm and len(tokens) == 2:
                # We only add the second token if there are only 2 tokens
                add_item = True
            if '*' in item:
                # Don't add as prefix if item is a wildcard search
                add_item = False
            if add_item:
                if len(place.prefix) > 0:
                    place.prefix += ' '
                place.prefix += item.title()

        if len(place.prefix) > 0:
            place.prefix_commas = ', '
        update[GeoKeys.Entry.PREFIX] = place.prefix
        # self.logger.debug(f'PREFIX={place.prefix} ')
        place.georow_list[idx] = tuple(update)
def output_row(self, row):
    """Write one place row to the CSV output file.

    Columns: 0 Place (ID), 1 Title, 2 Name, 3 Type, 4 latitude, 5 longitude,
    6 enclosed_by. Rows missing either coordinate get blank lat/lon fields.

    :param row: indexable record accessed via CSVEntry field indices
    """
    if len(row[CSVEntry.ENCLOSED_BY]) > 0:
        enc = f'[{row[CSVEntry.ENCLOSED_BY]}]'
    else:
        enc = ''

    if self.csv_path is not None:
        title = GeoKeys.capwords(row[CSVEntry.TITLE])
        name = GeoKeys.capwords(row[CSVEntry.NAME])
        # BUG FIX: original tested LAT twice, so a missing longitude was never
        # detected. Now both coordinates are checked.
        if math.isnan(float(row[CSVEntry.LAT])) or math.isnan(
                float(row[CSVEntry.LON])):
            # Missing coordinate - write blank lat/lon fields
            self.csvfile.write(
                f'[{row[CSVEntry.PLACE_ID]}],"{title}","{name}",{row[CSVEntry.TYPE]},'
                f' , ,{enc},\n')
        else:
            self.csvfile.write(
                f'[{row[CSVEntry.PLACE_ID]}],"{title}","{name}",{row[CSVEntry.TYPE]},'
                f'{row[CSVEntry.LAT]},{row[CSVEntry.LON]},{enc},\n')
def get_directory_from_ini(self) -> str:
    """Return the GeoFinder data directory, reading it from the INI file.

    If the INI file or its PATH/DIRECTORY entry is missing, a default folder
    under the user's home directory is written to the INI. If the resulting
    directory does not exist on disk, the user is prompted to choose one
    (exiting the app if they cancel).

    :return: the data directory path
    """
    # The 'missing INI file' and 'missing DIRECTORY entry' cases were two
    # identical copy-pasted branches in the original; merged into one.
    val = self.ini_read('PATH', 'DIRECTORY') if self.ini_path.is_file() else None
    if val:
        self.directory = val
    else:
        # Not Found. Create INI file with the default directory
        self.directory = Path(
            os.path.join(str(self.home_path), GeoKeys.get_directory_name()))
        self.ini_add_section('PATH')
        self.ini_set(section='PATH', key='DIRECTORY', val=str(self.directory))

    # if directory doesnt exist, prompt user for folder
    if not Path(self.directory).is_dir():
        messagebox.showinfo(
            'Geofinder Folder not found',
            'Choose Folder for GeoFinder data in next dialog')
        self.directory = filedialog.askdirectory(
            initialdir=self.home_path,
            title="Choose Folder for GeoFinder data")
        if len(self.directory) == 0:
            # User cancelled the dialog - nothing to run with
            sys.exit()
        else:
            self.ini_add_section('PATH')
            self.ini_set(section='PATH', key='DIRECTORY', val=str(self.directory))
    return self.directory
def __init__(self, frame, title, dir_name, cache_filename, error):
    """Build a setup/status frame: loads the cached country list and lays out
    the listbox, scrollbar, buttons and labels for editing it.

    :param frame: parent Tk frame
    :param title: text for the frame's title label
    :param dir_name: main data directory
    :param cache_filename: pickle file backing this frame's list
    :param error: error-reporting callback/flag supplied by the caller
    """
    self.logger = logging.getLogger(__name__)
    self.file_error = True
    self.title = title
    self.frame = frame
    self.separator = ":"
    self.dirty_flag = False  # Flag to track if data was modified
    self.error = error

    # Load in list from cache file
    self.directory = dir_name
    self.cache_dir = GeoKeys.get_cache_directory(dir_name)
    self.logger.debug(f'SetupStatusFrame dir={dir_name} sub_dir={self.cache_dir} file={cache_filename}')
    self.cache = CachedDictionary.CachedDictionary(self.cache_dir, cache_filename)
    self.cache.read()
    self.error_dict = {}  # Keep a dictionary of errors

    # Country list the frame displays/validates against
    self.supported_countries_cd = CachedDictionary.CachedDictionary(self.cache_dir, "country_list.pkl")
    self.supported_countries_cd.read()
    self.supported_countries_dct: Dict[str, str] = self.supported_countries_cd.dict

    self.logger.debug(f'country list len={len(self.supported_countries_dct)}')

    # Grid layout table: widget name -> [col, row, padx, pady, sticky]
    self.grd = {"title_label": [0, 0, 5, 5, "W"], "scrollbar": [1, 2, 0, 5, "WNS"],
                "status": [0, 1, 5, 5, "W"], "add_button": [2, 4, 5, 5, "W"],
                "listbox": [0, 2, 5, 5, "E"], "unused": [2, 3, 5, 5, "W"],
                "add_entry": [0, 4, 5, 5, "W"], "load_button": [2, 1, 5, 5, "W"],
                "geoname_button": [2, 1, 5, 5, "E"], "add_label": [0, 3, 5, 5, "EW"]}

    self.title_label = Widge.CLabel(frame, text=self.title, width=80, style='Info.TLabel')
    self.status = Widge.CLabel(frame, text=" ", width=80, style='Highlight.TLabel')
    self.scrollbar = Scrollbar(frame)
    self.listbox = Listbox(frame, width=80, height=20, bg=AppStyle.LT_GRAY,
                           selectmode=MULTIPLE, yscrollcommand=self.scrollbar.set)
    self.add_button = ttk.Button(frame, text="geonames.org", command=self.web_handler, width=12)

    # Configure buttons and widgets
    self.configure_widgets()
    #self.frame.columnconfigure(0, weight=5)
    #self.frame.columnconfigure(2, weight=2)
    #self.frame.rowconfigure(0, weight=2)
    #self.frame.rowconfigure(1, weight=2)

    # Display data
    self.load_handler()
def read(self) -> bool:
    """ Read in list of country names and ISO codes and insert each as an
    ADM0 row in the geo DB, localized to the first matching language in
    self.lang_list. Returns False (no error). """
    if self.progress is not None:
        self.progress.update_progress(100, "Read ISO countries...")

    # list of all countries and their ISO codes
    # This also includes some common aliases
    self.geodb.db.begin()
    self.logger.debug(self.lang_list)

    # Add country names to DB
    for ky, row in country_dict.items():
        # Localize country names to the specified language list.
        # NOTE: ky is deliberately reassigned to its translation below; the
        # original (English) key is not kept.
        for lang in self.lang_list:
            # If we have a translation table for this language, then apply it
            if trans_table.get(lang):
                tbl = trans_table.get(lang)
                # Look up the country translation
                if tbl.get(ky):
                    ky = tbl.get(ky)
                    break  # Apply first translation in list

        # Create Geo_row
        # ('paris', 'fr', '07', '012', '12.345', '45.123', 'PPL')
        geo_row = [None] * GeoDB.Entry.MAX
        geo_row[GeoDB.Entry.NAME] = GeoKeys.normalize(ky)
        # Countries use double-metaphone (first code) rather than soundex
        sdx = phonetics.dmetaphone(geo_row[GeoDB.Entry.NAME])
        geo_row[GeoDB.Entry.SDX] = sdx[0]
        geo_row[GeoDB.Entry.ISO] = row[CnRow.ISO].lower()
        geo_row[GeoDB.Entry.ADM1] = ''
        geo_row[GeoDB.Entry.ADM2] = ''
        geo_row[GeoDB.Entry.LAT] = row[CnRow.LAT]
        geo_row[GeoDB.Entry.LON] = row[CnRow.LON]
        geo_row[GeoDB.Entry.FEAT] = 'ADM0'
        geo_row[GeoDB.Entry.ID] = row[CnRow.ISO].lower()

        self.geodb.insert(geo_row=geo_row, feat_code='ADM0')

    self.geodb.db.commit()
    return False
def get_five_part_title(self):
    """Return a five-part title string: prefix, city, county, state, country.

    Temporarily forces place_type to CITY (yielding a four-part name) and
    normalizes the country name; both are restored before returning. The
    prefix (plus any 'extra' text) is prepended for the fifth part.
    """
    # Save state that format_full_nm depends on
    saved_type, saved_country = self.place_type, self.country_name

    # Force type to City to generate four part title (then we add prefix for five parts)
    self.place_type = PlaceType.CITY
    self.country_name, _modified = GeoKeys.country_normalize(self.country_name)

    head = self.prefix + ' ' + self.extra if len(self.extra) > 0 else self.prefix
    full_title = head + ',' + self.format_full_nm(None)

    # Restore values to original
    self.place_type, self.country_name = saved_type, saved_country
    return full_title
def write_updated_place(self, place: Loc.Loc, entry):
    """Write the corrected location and its lat/lon to the ancestry output file,
    mirroring input/output to the diagnostic files when diagnostics is on."""
    # Write out updated location and lat/lon to file
    self.geodata.geo_files.geodb.set_display_names(place)
    place.original_entry = place.format_full_nm(self.geodata.geo_files.output_replace_dct)
    # NOTE(review): prefix is read from self.place, not the 'place' parameter —
    # confirm these are always the same object when this is called.
    prefix = GeoKeys.capwords(self.place.prefix)
    if self.diagnostics:
        self.in_diag_file.write(f'{entry}\n')

    if place.result_type != GeoKeys.Result.DELETE:
        # self.logger.debug(f'Write Updated - name={place.name} pref=[{place.prefix}]')
        self.ancestry_file_handler.write_updated(
            prefix + place.prefix_commas + place.original_entry, place)
        self.ancestry_file_handler.write_lat_lon(lat=place.lat, lon=place.lon)
        text = prefix + place.prefix_commas + place.original_entry + '\n'
        # NOTE(review): str(bytes) writes the "b'...'" repr, not decoded text —
        # presumably intended to force ASCII-safe diagnostics; verify.
        text = str(text.encode('utf-8', errors='replace'))
        self.out_diag_file.write(text)
    else:
        # Entry flagged for deletion - nothing written to the output file
        # self.logger.debug('zero len, no output')
        if self.diagnostics:
            self.out_diag_file.write('DELETE\n')
        pass
def read_geoname(self) -> bool:
    """Open the cached geodata.db if valid; otherwise rebuild it from the
    geonames.org text files. Returns True on error, False on success."""
    # Read Geoname DB file - this is the db of geoname.org city files and is stored in cache directory under geonames_data
    # The db only contains important fields and only for supported countries
    # This file is much smaller and faster to read than the geoname files
    # If the db doesn't exist, read the geonames.org files and build it.
    # the UtilMain.py allows user changes to config parameters and then requires rebuild of db
    # if the user loads a new geonames.org file, we also need to rebuild the db
    # Use db if it exists and is newer than the geonames directory
    cache_dir = GeoKeys.get_cache_directory(self.directory)
    db_path = os.path.join(cache_dir, 'geodata.db')
    self.logger.debug(f'path for geodata.db: {db_path}')
    err_msg = ''

    # Validate Database setup
    if os.path.exists(db_path):
        # See if db is fresh (newer than other files)
        self.logger.debug(f'DB found at {db_path}')
        self.geodb = GeoDB.GeoDB(db_path=db_path, version=self.required_db_version)

        # Make sure DB is correct version
        ver = self.geodb.get_db_version()
        if ver != self.required_db_version:
            err_msg = f'Database version will be upgraded:\n\n{self.db_upgrade_text}\n\n' \
                      f'Upgrading database from V{ver} to V{self.required_db_version}.'
        else:
            # Correct Version. Make sure DB is not stale
            dir_time = os.path.getmtime(self.directory)
            cache_time = os.path.getmtime(db_path)
            # NOTE: the staleness comparison is disabled - the 'if True'/'if False'
            # constants below bypass the cache_time > dir_time check.
            #if cache_time > dir_time:
            if True:
                self.logger.info(f'DB is up to date')
                # Ensure DB has reasonable number of records
                count = self.geodb.get_row_count()
                self.logger.info(f'Geoname entries = {count:,}')
                if count < 1000:
                    # Error if DB has under 1000 records
                    err_msg = f'Geoname Database is too small.\n\n {db_path}\n\nRebuilding DB '
            if False:
                # DB is stale (dead code while the check above is disabled)
                err_msg = f'DB {db_path} is older than geonames.org files. Rebuilding DB '
                if not messagebox.askyesno(
                        'Stale Database',
                        'Database is older than geonames.org files.\n\nRebuild database?'):
                    err_msg = ''
    else:
        err_msg = f'Database not found at\n\n{db_path}.\n\nBuilding DB'

    self.logger.debug(f'{err_msg}')

    if err_msg == '':
        # No DB errors detected - make sure indices exist and we're done
        self.geodb.create_indices()
        self.geodb.create_geoid_index()
        return False

    # DB error detected - rebuild database
    self.logger.debug('message box')
    messagebox.showinfo('Database Error', err_msg)
    self.logger.debug('message box done')

    # DB error. Rebuild it from geoname files
    self.logger.debug(err_msg)
    if os.path.exists(db_path):
        self.geodb.close()
        os.remove(db_path)
        self.logger.debug('Database deleted')

    self.geodb = GeoDB.GeoDB(db_path=db_path, version=self.required_db_version)
    self.country = Country.Country(self.progress_bar, geodb=self.geodb, lang_list=self.lang_list)

    # walk thru list of files ending in .txt e.g US.txt, FR.txt, all_countries.txt, etc
    file_count = 0

    # Put in country data
    self.country.read()
    start_time = time.time()

    # Set DB version as -1 for incomplete
    self.geodb.insert_version(-1)

    # Put in geonames file data
    for fname in ['allCountries.txt', 'ca.txt', 'gb.txt', 'de.txt', 'fr.txt', 'nl.txt']:
        # Read all geoname files - read in info (lat/long) for all places
        error = self.read_geoname_file(fname)
        if error:
            self.logger.error(f'Error reading geoname file {fname}')
        else:
            file_count += 1

    if file_count == 0:
        self.logger.error(
            f'No geonames files found in {os.path.join(self.directory, "*.txt")}')
        return True

    # Put in alias names
    self.logger.info(f'geonames files done. Elapsed ={time.time() - start_time}')
    start_time = time.time()
    self.alternate_names.read()
    self.logger.info(f'Alternate names done. Elapsed ={time.time() - start_time}')
    self.logger.info(f'Geonames entries = {self.geodb.get_row_count():,}')

    start_time = time.time()
    self.progress("3) Final Step: Creating Indices for Database...", 95)
    self.geodb.create_geoid_index()
    self.geodb.create_indices()
    self.logger.debug(f'Indices done. Elapsed ={time.time() - start_time}')
    # Mark the DB complete by writing the real version number
    self.geodb.insert_version(self.required_db_version)
    return False
def read_geoname_file(self, file) -> bool:
    """Read in geonames files and build lookup structure

    Read a geoname.org places file and create a db of all the places.
    1. The db contains: Name, Lat, Long, district1ID (State or Province ID),
       district2_id, feat_code
    2. Since Geonames supports over 25M entries, the db is filtered to only
       the countries and feature types we want

    :param file: filename (relative to self.directory) to ingest
    :return: True on error (file missing), False on success
    """
    Geofile_row = namedtuple(
        'Geofile_row',
        'id name name_asc alt lat lon feat_class feat_code iso iso2 admin1_id'
        ' admin2_id admin3_id admin4_id pop elev dem timezone mod')
    self.line_num = 0
    self.progress("Reading {}...".format(file), 0)
    path = os.path.join(self.directory, file)

    if os.path.exists(path):
        fsize = os.path.getsize(path)
        # Rough average line size, used only to estimate progress percentage
        bytes_per_line = 128
        with open(path, 'r', newline="", encoding='utf-8', errors='replace') as geofile:
            self.progress("Building Database from {}".format(file), 2)  # initialize progress bar
            reader = csv.reader(geofile, delimiter='\t')
            self.geodb.db.begin()

            # Map line from csv reader into GeonameData namedtuple
            for line in reader:
                self.line_num += 1
                if self.line_num % 20000 == 0:
                    # Periodically update progress
                    prog = self.line_num * bytes_per_line * 100 / fsize
                    self.progress(
                        msg=f"1) Building Database from {file} {prog:.1f}%",
                        val=prog)
                try:
                    geoname_row = Geofile_row._make(line)
                except TypeError:
                    # Wrong number of fields on this line - skip it
                    self.logger.error(
                        f'Unable to parse geoname location info in {file} line {self.line_num}')
                    continue

                # Only handle line if it's for a country we follow and its
                # for a Feature tag we're interested in
                if geoname_row.iso.lower() in self.supported_countries_dct and \
                        geoname_row.feat_code in self.feature_code_list_dct:
                    self.insert_georow(geoname_row)
                    # If normalization changed the name, keep the raw name as an alternate
                    # NOTE(review): 'ut8' looks like a typo for 'utf8' but is kept
                    # as-is since it is a runtime value stored in the DB - confirm.
                    if geoname_row.name.lower() != GeoKeys.normalize(geoname_row.name):
                        self.geodb.insert_alternate_name(geoname_row.name,
                                                         geoname_row.id, 'ut8')

                if self.progress_bar is not None:
                    if self.progress_bar.shutdown_requested:
                        # Abort DB build. Clear out partial DB
                        self.geodb.clear_geoname_data()

        self.progress("Write Database", 90)
        self.geodb.db.commit()
        self.progress("Database created", 100)
        return False
    else:
        return True
def parse_place(self, place_name: str, geo_files):
    """
    Given a comma separated place name, parse into its city, AdminID,
    country_iso and type of entity (city, country etc).
    Expected format:  prefix,city,admin2,admin1,country
    Tokens are scanned from the right; each token that validates against the
    DB promotes the place type (country -> admin1 -> admin2 -> city).
    self.status has Result status code
    """
    self.clear()
    self.original_entry = place_name

    # Convert open-brace and open-paren to comma. close brace/paren will be stripped by normalize()
    res = re.sub('\[', ',', place_name)
    res = re.sub('\(', ',', res)

    tokens = res.split(",")
    token_count = len(tokens)
    self.place_type = PlaceType.CITY

    # Parse City, Admin2, Admin2, Country scanning from the right. When there are more tokens, we capture more fields
    # Place type is the leftmost item we found - either City, Admin2, Admin2, or Country
    # self.logger.debug(f'***** PLACE [{place_name}] *****')
    if '--' in place_name:
        # Pull out filter flags if present
        self.logger.debug('filter')
        self.filter(place_name, geo_files)
        return
    elif token_count > 0:
        # COUNTRY - right-most token should be country
        # Format: Country
        self.place_type = PlaceType.COUNTRY
        self.country_name = GeoKeys.search_normalize(tokens[-1], "")
        self.target = self.country_name

        # Validate country
        self.country_iso = geo_files.geodb.get_country_iso(self)  # Get Country country_iso
        if self.country_iso != '':
            # self.logger.debug(f'Found country. iso = [{self.country_iso}]')
            pass
        else:
            # Last token is not COUNTRY.
            # Append blank to token list so we now have xx,admin1, blank_country
            tokens.append('')
            token_count = len(tokens)
            self.result_type = GeoKeys.Result.NO_COUNTRY
            self.country_iso = ''
            self.country_name = ''

    if token_count > 1:
        # Format: Admin1, Country.
        # Admin1 is 2nd to last token
        self.admin1_name = GeoKeys.search_normalize(tokens[-2], self.country_iso)
        self.admin1_name = GeoKeys.admin1_normalize(self.admin1_name, self.country_iso)
        if len(self.admin1_name) > 0:
            self.place_type = PlaceType.ADMIN1
            self.target = self.admin1_name
            # Lookup Admin1
            geo_files.geodb.get_admin1_id(self)
            if self.admin1_id != '':
                # self.logger.debug(f'Found admin1 {self.admin1_name}')
                pass
            else:
                # Last token is not Admin1 - append blank
                self.admin1_name = ''
                # Append blank token for admin1 position
                tokens.append('')
                token_count = len(tokens)

    if token_count == 3 and self.admin1_name == '' and self.country_name == '':
        # Just one valid token, so take as city
        self.city1 = GeoKeys.search_normalize(tokens[-3], self.country_iso)
        if len(self.city1) > 0:
            self.place_type = PlaceType.CITY
            self.target = self.city1
    elif token_count > 2:
        # Format: Admin2, Admin1, Country
        # Admin2 is 3rd to last. Note - if Admin2 isnt found, it will look it up as city
        self.admin2_name = GeoKeys.search_normalize(tokens[-3], self.country_iso)
        self.admin2_name, modif = GeoKeys.admin2_normalize(self.admin2_name, self.country_iso)
        if len(self.admin2_name) > 0:
            self.place_type = PlaceType.ADMIN2
            self.target = self.admin2_name

    if token_count > 3:
        # Format: Prefix, City, Admin2, Admin1, Country
        # City is 4th to last token
        # Other tokens go into Prefix
        self.city1 = GeoKeys.search_normalize(tokens[-4], self.country_iso)
        if len(self.city1) > 0:
            self.place_type = PlaceType.CITY
            self.target = self.city1

        # Assign remaining tokens (if any) to prefix. Zero'th token to 4th from end.
        for item in tokens[0:-4]:
            if len(self.prefix) > 0:
                self.prefix += ' '
            self.prefix += str(item.strip(' '))

    # Special case for New York, New York which normally refers to the City, not county
    if self.admin2_name == 'new york' and self.place_type == PlaceType.ADMIN2:
        self.admin2_name = 'new york city'
        self.target = self.admin2_name

    self.prefix = self.prefix.strip(',')
    self.logger.debug(
        f"    ======= PARSE: {place_name} City [{self.city1}] Adm2 [{self.admin2_name}]"
        f" Adm1 [{self.admin1_name}] adm1_id [{self.admin1_id}] Cntry [{self.country_name}] Pref=[{self.prefix}]"
        f" type_id={self.place_type}")
    return
def __init__(self, directory: str, progress_bar):
    """Load all cached configuration dictionaries (features, countries,
    languages, output replacements) and set up the alternate-names reader.

    :param directory: main GeoFinder data directory
    :param progress_bar: progress widget for long-running reads
    """
    self.logger = logging.getLogger(__name__)
    self.geodb = None                 # GeoDB instance, created in read_geoname()
    self.required_db_version = 2
    self.db_upgrade_text = 'Adding support for non-English output'
    self.directory: str = directory
    self.progress_bar = progress_bar
    self.line_num = 0
    self.cache_changed: bool = False
    sub_dir = GeoKeys.get_cache_directory(self.directory)
    self.country = None

    # Read in dictionary listing Geoname features we should include
    self.feature_code_list_cd = CachedDictionary.CachedDictionary(sub_dir, "feature_list.pkl")
    self.feature_code_list_cd.read()
    self.feature_code_list_dct: Dict[str, str] = self.feature_code_list_cd.dict
    if len(self.feature_code_list_dct) < 3:
        # Fewer than 3 entries is treated as empty/corrupt - reset to defaults
        self.logger.warning('Feature list is empty. Setting defaults')
        self.feature_code_list_dct.clear()
        feature_list = UtilFeatureFrame.default
        for feat in feature_list:
            self.feature_code_list_dct[feat] = ''
        self.feature_code_list_cd.write()

    # Read in dictionary listing countries (ISO2) we should include
    self.supported_countries_cd = CachedDictionary.CachedDictionary(sub_dir, "country_list.pkl")
    self.supported_countries_cd.read()
    self.supported_countries_dct: Dict[str, str] = self.supported_countries_cd.dict

    # Read in dictionary listing languages (ISO2) we should include
    self.languages_list_cd = CachedDictionary.CachedDictionary(sub_dir, "languages_list.pkl")
    self.languages_list_cd.read()
    self.languages_list_dct: Dict[str, str] = self.languages_list_cd.dict
    self.lang_list = []
    for item in self.languages_list_dct:
        self.lang_list.append(item)

    # Read in dictionary listing output text replacements
    self.output_replace_cd = CachedDictionary.CachedDictionary(sub_dir, "output_list.pkl")
    self.output_replace_cd.read()
    self.output_replace_dct: Dict[str, str] = self.output_replace_cd.dict
    self.output_replace_list = []
    for item in self.output_replace_dct:
        self.output_replace_list.append(item)
        self.logger.debug(f'Output replace [{item}]')

    self.entry_place = Loc.Loc()

    # Support for Geonames AlternateNames file. Adds alternate names for entries
    self.alternate_names = AlternateNames.AlternateNames(
        directory_name=self.directory,
        geo_files=self,
        progress_bar=self.progress_bar,
        filename='alternateNamesV2.txt',
        lang_list=self.lang_list)
def match_score(self, inp_place: Loc.Loc, res_place: Loc.Loc) -> int:
    """
    :param inp_place: Input place structure with users text
    :param res_place: Result place structure with DB result
    :return: score 0-100 reflecting the difference between the user input
        and the result. 0 is perfect match, 100 is no match
    Score is also adjusted based on Feature type. More important features
    (large city) get lower result
    """
    inp_len = [0] * 5
    num_inp_tokens = 0.0
    in_score = 0

    # Create full place title (prefix,city,county,state,country) from input place
    inp_title = inp_place.get_five_part_title()
    inp_title = GeoKeys.normalize_match_title(inp_title, inp_place.country_iso)
    inp_tokens = inp_title.split(',')

    # Create full place title (prefix,city,county,state,country) from result place
    res_place.prefix = ' '
    res_title = res_place.get_five_part_title()
    res_title = GeoKeys.normalize_match_title(res_title, res_place.country_iso)
    res_tokens = res_title.split(',')

    # Store length of original input tokens. This is used for percent unmatched calculation
    for it, tk in enumerate(inp_tokens):
        inp_tokens[it] = inp_tokens[it].strip(' ')
        inp_len[it] = len(inp_tokens[it])

    # Create a list of all the words in result and save result len for percent calc
    res_word_list = ', '.join(map(str, res_tokens))
    orig_res_len = len(res_word_list)

    # Create a list of all the words in input
    input_words = ', '.join(map(str, inp_tokens))

    # Remove any matching sequences in input list and result
    res_word_list, input_words = self.remove_matching_sequences(res_word_list, input_words)

    # For each input token calculate percent of new (unmatched) size vs original size
    unmatched_input_tokens = input_words.split(',')

    # Each token in place hierarchy gets a different weighting
    # Prefix, city, county, state, country
    score_diags = ''

    # Calculate percent of USER INPUT text that was unmatched, then apply weighting
    for idx, tk in enumerate(inp_tokens):
        if inp_len[idx] > 0:
            unmatched_percent = int(100.0 * len(unmatched_input_tokens[idx].strip(' ')) / inp_len[idx])
            in_score += unmatched_percent * self.weight[idx]
            score_diags += f' {idx}) [{tk}]{inp_len[idx]} {unmatched_percent}% * {self.weight[idx]} '
            # self.logger.debug(f'{idx}) Rem=[{unmatched_input_tokens[idx].strip(" " )}] wgtd={unmatched_percent * self.weight[idx]}')
            num_inp_tokens += 1.0 * self.weight[idx]
            # self.logger.debug(f'{idx} [{inp_tokens2[idx]}:{inp_tokens[idx]}] rawscr={sc}% orig_len={inp_len[idx]} wgt={self.weight[idx]}')
            if idx < 2:
                # If the full first or second token of the result is in input then improve score
                # Bonus for a full match as against above partial matches
                if res_tokens[idx] in inp_tokens[idx]:
                    in_score -= self.first_token_match_bonus

    # Average over number of tokens (with fractional weight). Gives 0-100% regardless of weighting and number of tokens
    in_score = in_score / num_inp_tokens
    # self.logger.debug(f'raw in={in_score} numtkn={num_inp_tokens}')

    # Calculate percent of DB RESULT text that was unmatched
    if orig_res_len > 0:
        out_score = int(100.0 * len(res_word_list.strip(' ')) / orig_res_len)
        # self.logger.debug(f"Out=[{res_word_list.strip(' ')}] orig_len={orig_res_len}")
    else:
        out_score = 0

    if not inp_place.standard_parse:
        # If Tokens were not in hierarchical order, give penalty
        parse_penalty = self.wrong_order_penalty
    else:
        parse_penalty = 0.0

    if '*' in inp_place.original_entry:
        # if it was a wildcard search it's hard to rank - add a penalty
        wildcard_penalty = self.wildcard_penalty
    else:
        wildcard_penalty = 0.0

    # Feature score is to ensure "important" places get higher rank (large city, etc)
    feature_score = Geodata.Geodata.get_priority(res_place.feature)

    # Add up scores - Each item is 0-100 and weighed as below
    in_weight = 1.0 - self.out_weight - self.feature_weight
    score = in_score * in_weight + out_score * self.out_weight + \
        feature_score * self.feature_weight + parse_penalty + wildcard_penalty

    # self.logger.debug(f'SCORE {score:.1f} [{res_title}] out={out_score * out_weight:.1f} '
    #                   f'in={in_score:.1f} feat={feature_score * feature_weight:.1f} parse={parse_penalty}\n {score_diags}')
    return score
def handle_place_entry(self):
    """ Get next PLACE in user's file.  Replace it, skip it, or have user correct it.

    Loops over PLACE entries until one needs user review (break) or EOF.
    For each entry, in priority order:
      1. Global-replace list hit  -> apply stored fix automatically.
      2. Skip-list hit            -> write entry out unchanged.
      3. Otherwise look it up in the geo database; strong matches are written
         automatically (and remembered in the global-replace list), weaker or
         failed matches break out of the loop for user review.
    """
    self.w.original_entry.set_text("")
    if self.w.prog.shutdown_requested:
        self.periodic_update("Shutting down...")
    else:
        self.periodic_update("Scanning")
    self.clear_detail_text(self.place)

    while True:
        self.err_count += 1  # Counter is used to periodically update
        # Update statistics
        done = self.update_statistics()

        if self.ancestry_file_handler.place_total > 0:
            self.w.prog.update_progress(100 * done / self.ancestry_file_handler.place_total, " ")
        else:
            # Total unknown (or zero) - can't compute a percentage yet
            self.w.prog.update_progress(0, " ")

        # Find the next PLACE entry in file
        # Process it and keep looping until we need user input
        self.place.clear()
        town_entry, eof, rec_id = self.ancestry_file_handler.get_next_place()
        town_entry = GeoKeys.semi_normalize(town_entry)
        self.place.id = rec_id
        if eof:
            self.end_of_file_shutdown()

        # See if we already have a fix (Global Replace) or Skip (ignore).
        # Otherwise have user handle it
        replacement_geoid = self.get_replacement(self.global_replace, town_entry, self.place)

        if replacement_geoid is not None:
            # IN GLOBAL REPLACE LIST - There is a global change that we can apply to this line.
            self.matched_count += 1

            if self.place.result_type == GeoKeys.Result.STRONG_MATCH:
                # Output updated place to ancestry file
                self.write_updated_place(self.place, town_entry)
                if self.place.prefix != '':
                    pass
                    # self.logger.debug(f'write upd prefix= [{self.place.original_entry}]')

                # Display status to user
                if self.w.prog.shutdown_requested:
                    self.periodic_update("Creating Import...")
                else:
                    self.periodic_update("Applying change")
            elif self.place.result_type == GeoKeys.Result.DELETE:
                # Stored fix says this entry should be dropped entirely
                continue
            else:
                # Stored GEOID no longer resolves to a strong match - flag it and
                # fall through to user review (break below)
                self.logger.warning(f'***ERROR looking up GEOID=[{replacement_geoid}] for [{town_entry}] ')
                self.place.event_year = int(self.ancestry_file_handler.event_year)  # Set place date to event date (geo names change over time)
                self.w.original_entry.set_text(f'** DATABASE ERROR FOR GEOID=[{replacement_geoid}] for [{town_entry}]')
                self.w.user_entry.set_text(f'{town_entry}')
                self.geodata.find_location(town_entry, self.place, self.w.prog.shutdown_requested)
                break
            continue
        elif self.skiplist.get(town_entry) is not None:
            # IN SKIPLIST - Write out as-is and go to next error
            self.skip_count += 1
            self.periodic_update("Skipping")
            self.ancestry_file_handler.write_asis(town_entry)
            continue
        else:
            # Found a PLACE entry that we don't have a global replace or skip for
            # See if it is in our place database
            self.place.event_year = int(self.ancestry_file_handler.event_year)  # Set place date to event date (geo names change over time)
            self.geodata.find_location(town_entry, self.place, self.w.prog.shutdown_requested)
            # if self.place.result_type not in GeoKeys.successful_match:
            #    self.geodata.set_last_iso('')
            #    self.geodata.find_location(town_entry, self.place)

            if self.place.result_type in GeoKeys.successful_match:
                # FOUND A MATCH
                if self.place.result_type == GeoKeys.Result.STRONG_MATCH:
                    # Strong match
                    self.matched_count += 1

                    # Write out line without user verification
                    if self.w.prog.shutdown_requested:
                        self.periodic_update("Creating Import...")
                    else:
                        self.periodic_update("Scanning")

                    # Add to global replace list - Use '@' for tokenizing.  Save GEOID_TOKEN and PREFIX_TOKEN
                    res = '@' + self.place.geoid + '@' + self.place.prefix
                    self.global_replace.set(town_entry, res)
                    self.logger.debug(f'Found Strong Match for {town_entry} res= [{res}] Setting DICT')
                    # Periodically flush dictionary to disk.  (We flush on exit as well)
                    if self.err_count % 100 == 1:
                        self.global_replace.write()
                    self.write_updated_place(self.place, town_entry)
                    continue
                else:
                    # Found match, but not a Strong Match
                    if self.w.prog.shutdown_requested:
                        # User requested shutdown.  Write this item out as-is
                        self.review_count += 1
                        self.periodic_update("Creating Import...")
                        self.w.original_entry.set_text(" ")
                        self.ancestry_file_handler.write_asis(town_entry)
                        continue
                    else:
                        # Have user review the match
                        self.logger.debug(f'user review for {town_entry} res= [{self.place.result_type}] ')
                        self.w.status.configure(style="Good.TLabel")
                        self.w.original_entry.set_text(self.place.original_entry)  # Display place
                        self.w.user_entry.set_text(self.place.original_entry)  # Display place
                        break
            else:
                # No match
                if self.w.prog.shutdown_requested:
                    # User requested shutdown.  Write this item out as-is
                    self.review_count += 1
                    self.periodic_update("Creating Import...")
                    self.w.original_entry.set_text(" ")
                    self.ancestry_file_handler.write_asis(town_entry)
                    continue
                else:
                    # Have user review match
                    # self.logger.debug(f'User2 review for {town_entry}. status ={self.place.status}')
                    self.w.status.configure(style="Good.TLabel")
                    self.w.original_entry.set_text(self.place.original_entry)  # Display place
                    self.w.user_entry.set_text(self.place.original_entry)  # Display place
                    break

    # Have user review the result
    self.display_result(self.place)
def __init__(self):
    """ Initialize the GeoFinder application.

    Checks the Python version, parses command-line flags (--logging,
    --diagnostics), sets up logging, creates the main Tk window, resolves the
    working/cache directories from geofinder.ini (offering to create the cache
    folder if missing), validates the configuration, and finally enters the
    Tk main loop waiting for the user to click Load.
    """
    print('GeoFinder v{}'.format(__version__.__version__))
    print('Python {}.{}'.format(sys.version_info[0], sys.version_info[1]))
    if sys.version_info < (3, 6, 0):
        raise Exception("GeoFinder Requires Python 3.6 or higher.")
    val = ''
    # NOTE(review): this print runs unconditionally, even when the version check
    # above passed - looks like leftover debug output; confirm before removing
    print(f'GeoFinder Requires Python 3.6 or higher {val}')

    self.save_enabled = False  # Only allow SAVE when we have an item that was matched in geonames
    self.user_selected_list = False  # Indicates whether user selected a list entry or text edit entry
    # Progress/result counters displayed in statistics
    self.err_count = 0
    self.matched_count = 0
    self.review_count = 0
    self.skip_count = 0
    self.odd = False
    # Collaborator objects - created later, after configuration is validated
    self.ancestry_file_handler = None
    self.place = None
    self.skiplist = None
    self.global_replace = None
    self.geodata = None
    self.out_suffix = 'unknown_suffix'
    self.out_diag_file = None
    self.in_diag_file = None

    # initiate the parser
    parser = argparse.ArgumentParser()
    parser.add_argument("--logging", help="Enable quiet logging")
    parser.add_argument("--diagnostics", help="Create diagnostics files")

    # read arguments from the command line
    args = parser.parse_args()

    # check for --verbose switch
    if args.logging == 'info':
        self.logger = self.setup_logging_info('geofinder Init')
        self.logger.info(f"--logging set to INFO logging {args.logging}")
    else:
        self.logger = self.setup_logging('geofinder Init')

    # check for --diagnostics switch
    if args.diagnostics:
        self.logger.info(f"--diagnostics files enabled {args.diagnostics}")
        self.diagnostics = True
    else:
        self.diagnostics = False

    # Create App window and configure window buttons and widgets
    self.w: AppLayout.AppLayout = AppLayout.AppLayout(self)
    self.w.create_initialization_widgets()
    self.w.config_button.config(state="normal")

    # Get our base directory path from INI file.  Create INI if it doesnt exist
    home_path = str(Path.home())
    # NOTE(review): this Path value is immediately overwritten by the INI lookup
    # below - presumably IniHandler creates the INI with this default; confirm
    self.directory = Path(os.path.join(home_path, GeoKeys.get_directory_name()))
    self.ini_handler = IniHandler(home_path=home_path, ini_name='geofinder.ini')
    self.directory = self.ini_handler.get_directory_from_ini()

    self.cache_dir = GeoKeys.get_cache_directory(self.directory)
    self.logger.info(f'Cache directory {self.cache_dir}')

    # Set up configuration class
    self.cfg = Config.Config(self.directory)
    self.util = UtilLayout.UtilLayout(root=self.w.root, directory=self.directory, cache_dir=self.cache_dir)

    if not os.path.exists(self.cache_dir):
        # Create directories for GeoFinder
        if messagebox.askyesno('Geoname Data Cache Folder not found',
                               f'Create Geoname Cache folder?\n\n{self.cache_dir} '):
            err = self.cfg.create_directories()
            if not os.path.exists(self.cache_dir):
                # Creation failed - warn and exit
                messagebox.showwarning('Geoname Data Cache Folder not found',
                                       f'Unable to create folder\n\n{self.cache_dir} ')
                self.shutdown()
            else:
                self.logger.debug(f'Created {self.cache_dir}')
                messagebox.showinfo('Geoname Data Cache Folder created',
                                    f'Created folder\n\n{self.cache_dir} ')
        else:
            # User declined folder creation - nothing we can do
            self.shutdown()

    # Ensure GeoFinder directory structure is valid
    if self.cfg.valid_directories():
        # Directories are valid.  See if required Geonames files are present
        err = self.check_configuration()
        if err:
            # Missing files
            self.logger.warning('Missing files')
            self.w.status.set_text("Click Config to set up Geo Finder")
            TKHelper.set_preferred_button(self.w.config_button, self.w.initialization_buttons, "Preferred.TButton")
            self.w.load_button.config(state="disabled")
        else:
            # No config errors
            # Read config settings (Ancestry file path)
            err = self.cfg.read()
            if err:
                self.logger.warning('error reading {} config.pkl'.format(self.cache_dir))

            self.w.original_entry.set_text(self.cfg.get("gedcom_path"))
            TKHelper.enable_buttons(self.w.initialization_buttons)
            if os.path.exists(self.cfg.get("gedcom_path")):
                # file is valid.  Prompt user to click Open for file
                # NOTE(review): file_types is not defined in this method - presumably a
                # module-level constant; verify it is in scope
                self.w.status.set_text(f"Click Open to load {file_types} file")
                TKHelper.set_preferred_button(self.w.load_button, self.w.initialization_buttons, "Preferred.TButton")
            else:
                # No file.  prompt user to select a file - GEDCOM file name isn't valid
                self.w.status.set_text(f"Choose a {file_types} file")
                self.w.load_button.config(state="disabled")
                TKHelper.set_preferred_button(self.w.choose_button, self.w.initialization_buttons, "Preferred.TButton")
    else:
        # Missing directories
        self.logger.warning('Directories not found: {} '.format(self.cache_dir))
        self.w.status.set_text("Click Config to set up Geo Finder")
        self.w.load_button.config(state="disabled")
        TKHelper.set_preferred_button(self.w.config_button, self.w.initialization_buttons, "Preferred.TButton")

    # Flag to indicate whether we are in startup or in Window loop.  Determines how window idle is called
    self.startup = False
    self.w.root.mainloop()  # ENTER MAIN LOOP and Wait for user to click on load button