コード例 #1
0
ファイル: UtilMain.py プロジェクト: prculley/GeoFinder
    def __init__(self):
        """Configure logging, prepare the data folders and settings, then build the GeoUtil window."""
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(levelname)s %(asctime)s %(name)s.%(funcName)s %(lineno)d: %(message)s")
        self.logger.info('Configuration')

        # The application folder lives under the user's home directory
        home_dir = str(Path.home())
        self.directory: str = os.path.join(home_dir, GeoKeys.get_directory_name())
        self.cache_dir = GeoKeys.get_cache_directory()

        # Settings are persisted in config.pkl inside the cache folder
        self.cfg: CachedDictionary.CachedDictionary = CachedDictionary.CachedDictionary(self.cache_dir, "config.pkl")

        # Create whichever folder is missing before the settings are read
        main_folder_missing = not os.path.exists(self.directory)
        if main_folder_missing:
            self.logger.info(f'Creating main folder {self.directory}')
            os.makedirs(self.directory)

        cache_folder_missing = not os.path.exists(self.cache_dir)
        if cache_folder_missing:
            self.logger.info(f'Creating cache folder {self.cache_dir}')
            os.makedirs(self.cache_dir)

        self.cfg.read()

        # Verify the configuration (get_config is defined elsewhere in the class)
        self.get_config()

        # Build the top-level window with 30 pixels of padding on each axis
        self.root = Tk()
        self.root.configure(padx=30)
        self.root.configure(pady=30)
        self.root.title('GeoUtil')

        UtilLayout.UtilLayout(root=self.root, directory=self.directory, cache_dir=self.cache_dir)
コード例 #2
0
    def insert_georow(self, geoname_row):
        """
        Build a geo_row from a parsed geonames record and insert it into the database.

        Row layout example: ('paris', 'fr', '07', '012', 12.345, 45.123, 'PPL', '34124')
        :param geoname_row: record exposing name, iso, admin1_id, admin2_id, lat, lon, feat_code, id and pop
        """
        entry = GeoDB.Entry
        geo_row = [None] * entry.MAX

        normalized_name = GeoKeys.normalize(geoname_row.name)
        geo_row[entry.NAME] = normalized_name
        geo_row[entry.SDX] = GeoKeys.get_soundex(normalized_name)

        geo_row[entry.ISO] = geoname_row.iso.lower()
        geo_row[entry.ADM1] = geoname_row.admin1_id
        geo_row[entry.ADM2] = geoname_row.admin2_id
        geo_row[entry.LAT] = geoname_row.lat
        geo_row[entry.LON] = geoname_row.lon
        feat_code = geoname_row.feat_code
        geo_row[entry.FEAT] = feat_code
        geo_row[entry.ID] = geoname_row.id

        # Populated places get a synthetic feature code based on population size
        population = int(geoname_row.pop)
        if 'PP' in feat_code:
            if population > 1000000:
                geo_row[entry.FEAT] = 'PP1M'
            elif population > 100000:
                geo_row[entry.FEAT] = 'P1HK'
            elif population < 10000:
                geo_row[entry.FEAT] = 'PPLL'

        self.geodb.insert(geo_row=geo_row, feat_code=feat_code)

        # US states are inserted a second time under their lower-cased ADM1 code (the abbreviation)
        is_us_state = geo_row[entry.ISO] == 'us' and feat_code == 'ADM1'
        if is_us_state:
            geo_row[entry.NAME] = geo_row[entry.ADM1].lower()
            self.geodb.insert(geo_row=geo_row, feat_code=feat_code)
コード例 #3
0
ファイル: AlternateNames.py プロジェクト: prculley/GeoFinder
 def __init__(self, directory_name: str, filename: str, progress_bar,
              geo_files: GeodataFiles, lang_list):
     """
     Initialise the base file reader, then record what the alternate-name loader needs.

     :param directory_name: folder passed to the base class and used to derive the cache folder
     :param filename: file name passed to the base class
     :param progress_bar: progress reporter passed to the base class
     :param geo_files: GeodataFiles instance stored for later use
     :param lang_list: language codes stored for later use
     """
     super().__init__(directory_name, filename, progress_bar)
     # Scratch location object
     self.loc = Loc()
     self.lang_list = lang_list
     self.geo_files: GeodataFiles.GeodataFiles = geo_files
     self.sub_dir = GeoKeys.get_cache_directory(directory_name)
コード例 #4
0
ファイル: GeoDB.py プロジェクト: prculley/GeoFinder
    def get_country_iso(self, place: Loc) -> str:
        """
        Return the ISO code for the country named in place.country_name.

        When exactly one row matches, place.country_name is replaced by the name stored in that row.
        :param place: location whose country_name is looked up
        :return: ISO code of the first matching row, or '' when nothing matches
        """
        lookup_target, _ = GeoKeys.country_normalize(place.country_name)
        if len(lookup_target) == 0:
            return ''

        # Only an exact name match on ADM0 rows is tried; looser LIKE and soundex
        # queries existed here previously and are disabled.
        exact_match = Query(where="name = ? AND f_code = ? ",
                            args=(lookup_target, 'ADM0'),
                            result=Result.STRONG_MATCH)

        row_list, _ = self.db.process_query_list(
            from_tbl='main.admin', query_list=[exact_match])

        if len(row_list) == 0:
            return ''

        first_row = row_list[0]
        iso = first_row[Entry.ISO]
        if len(row_list) == 1:
            # Unambiguous hit: adopt the database spelling of the country name
            place.country_name = first_row[Entry.NAME]
        return iso
コード例 #5
0
ファイル: Loc.py プロジェクト: prculley/GeoFinder
    def format_full_nm(self, replace_dct):
        """
        Assemble the full display name of this place from its parts, e.g. city, adm2, adm1, country.

        Side effects: refreshes the place type, turns None admin names into '', clears the prefix
        if it already occurs in the name, and sets prefix_commas accordingly.
        :param replace_dct: mapping of regex pattern -> replacement applied to the result; may be None or empty
        :return: the capitalised name after replacements
        """
        self.set_place_type()

        if self.admin1_name is None:
            self.admin1_name = ''
        if self.admin2_name is None:
            self.admin2_name = ''

        kind = self.place_type
        if kind == PlaceType.COUNTRY:
            nm = f"{self.country_name}"
        elif kind == PlaceType.ADMIN1:
            nm = f"{self.admin1_name}, {self.country_name}"
        elif kind == PlaceType.ADMIN2:
            nm = f"{self.admin2_name}, {self.admin1_name}, {self.country_name}"
        else:
            nm = f"{self.city1}, {self.admin2_name}, {self.admin1_name}, {str(self.country_name)}"

        # Drop a prefix that merely repeats part of the name
        if self.prefix in nm:
            self.prefix = ''

        self.prefix_commas = ', ' if len(self.prefix) > 0 else ''

        nm = GeoKeys.capwords(nm)

        # Apply the user's text replacements (entered on the Output tab)
        if replace_dct:
            for pattern, replacement in replace_dct.items():
                nm = re.sub(pattern, replacement, nm)

        return nm
コード例 #6
0
ファイル: Loc.py プロジェクト: prculley/GeoFinder
    def filter(self, place_name, geo_files):
        """
        Parse an advanced-search entry: a place name followed by comma separated '--' options.

        Recognised options are feature, iso and country; iso and country both set country_iso.
        Any error raised while parsing is logged at debug level and otherwise ignored.
        :param place_name: raw entry text
        :param geo_files: not used by this method
        """
        tokens = place_name.split(",")
        # Every comma separated token containing '--' is treated as one option argument
        args = [tkn.strip(' ') for tkn in tokens if '--' in tkn]

        parser = ArgumentParserNoExit(description="Parses command.")
        for short_flag, long_flag in (("-f", "--feature"),
                                      ("-i", "--iso"),
                                      ("-c", "--country")):
            parser.add_argument(short_flag, long_flag, help=argparse.SUPPRESS)

        try:
            options = parser.parse_args(args)
            # The first token is the search target
            self.city1 = GeoKeys.search_normalize(tokens[0], self.country_iso)
            self.target = self.city1
            if options.iso:
                self.country_iso = options.iso.lower()
            if options.country:
                self.country_iso = options.country.lower()
            if options.feature:
                self.feature = options.feature.upper()
            self.place_type = PlaceType.ADVANCED_SEARCH
        except Exception as e:
            self.logger.debug(e)

        self.logger.debug(
            f'ADV SEARCH: targ={self.city1} iso={self.country_iso} feat={self.feature} typ={self.place_type}'
        )
コード例 #7
0
    def __init__(self, directory):
        """
        Remember the data directory and derive its cache folder.

        :param directory: base directory for the application's data
        """
        self.logger = logging.getLogger(__name__)
        # Config dictionary starts unset here
        self.config_cd: CachedDictionary = None

        self.directory: str = directory
        self.cache_dir = GeoKeys.get_cache_directory(directory)
コード例 #8
0
ファイル: AlternateNames.py プロジェクト: prculley/GeoFinder
    def handle_line(self, line_num, row):
        """
        Process one tab separated line of the alternate-names file (called for each line read).

        A name in a requested language whose geoname ID is already in the main or admin
        lookup dictionaries is inserted as an extra database row, and non-English names
        are also written to the alternate-names table.
        :param line_num: line number, used only in the debug message
        :param row: raw line text
        """
        alt_tokens = row.split('\t')
        if len(alt_tokens) != 10:
            self.logger.debug(
                f'Incorrect number of tokens: {alt_tokens} line {line_num}')
            return

        self.loc.georow_list = []

        # Only names in one of the requested languages are of interest
        lang = alt_tokens[ALT_LANG]
        if lang not in self.lang_list:
            return

        # Find the existing entry for this geoname ID: main table first, then admin table
        geoid = alt_tokens[ALT_GEOID]
        dbid = self.geo_files.geodb.geoid_main_dict.get(geoid)
        if dbid is not None:
            self.loc.target = dbid
            self.geo_files.geodb.lookup_main_dbid(place=self.loc)
        else:
            dbid = self.geo_files.geodb.geoid_admin_dict.get(geoid)
            if dbid is not None:
                self.loc.target = dbid
                self.geo_files.geodb.lookup_admin_dbid(place=self.loc)

        if len(self.loc.georow_list) == 0:
            return

        # Copy the first matching row, swap in the alternate name and replace the
        # last column with the soundex of that name
        alias = alt_tokens[ALT_NAME]
        lst = list(self.loc.georow_list[0])
        lst.pop()
        lst[GeoDB.Entry.NAME] = GeoKeys.normalize(alias)
        lst.append(GeoKeys.get_soundex(alias))
        new_row = tuple(lst)

        # Skip only the combination of an English name and an ADM feature
        feat = lst[GeoDB.Entry.FEAT]
        if not (lang == 'en' and 'ADM' in feat):
            self.geo_files.geodb.insert(geo_row=new_row, feat_code=feat)
            self.count += 1

        # Non-English names also go into the altnames table
        if lang != 'en':
            self.geo_files.geodb.insert_alternate_name(alias, geoid, lang)
コード例 #9
0
    def update_rowlist_prefix(self, place: Loc.Loc):
        """
        Fill in the prefix column of every row in place.georow_list.

        For each row, the leading tokens of the user's original entry that do not
        appear in that row's formatted name become the prefix.
        :param place: location whose georow_list is rewritten in place
        """
        scratch = Loc.Loc()
        tokens = place.original_entry.split(',')
        exactly_two_tokens = len(tokens) == 2

        for idx, georow in enumerate(place.georow_list):
            fields = list(georow)

            # Format this row's full name, with no prefix, for comparison
            self.geo_files.geodb.copy_georow_to_place(georow, scratch)
            scratch.prefix = ''
            nm = GeoKeys.search_normalize(
                scratch.format_full_nm(self.geo_files.output_replace_dct),
                place.country_iso)
            place.prefix = ''

            unused = []
            for num, fld in enumerate(tokens[:2]):
                item = GeoKeys.search_normalize(fld, place.country_iso)
                # Already part of the name, or a wildcard search term: not a prefix
                if item in nm or '*' in item:
                    continue
                # The second token only counts when the entry has exactly two tokens
                if num == 0 or exactly_two_tokens:
                    unused.append(item.title())
            place.prefix = ' '.join(unused)

            if len(place.prefix) > 0:
                place.prefix_commas = ', '
            fields[GeoKeys.Entry.PREFIX] = place.prefix

            place.georow_list[idx] = tuple(fields)
コード例 #10
0
ファイル: GrampsCsv.py プロジェクト: prculley/GeoFinder
    def output_row(self, row):
        """
        Write one place row to the CSV file; does nothing when no CSV path is configured.

        Columns written: [place id], "title", "name", type, latitude, longitude, [enclosed_by].
        Latitude and longitude are written as blanks when either of them is NaN.
        :param row: sequence indexed by the CSVEntry constants
        """
        enclosed_by = row[CSVEntry.ENCLOSED_BY]
        enc = f'[{enclosed_by}]' if len(enclosed_by) > 0 else ''

        if self.csv_path is None:
            return

        title = GeoKeys.capwords(row[CSVEntry.TITLE])
        name = GeoKeys.capwords(row[CSVEntry.NAME])
        lat = row[CSVEntry.LAT]
        lon = row[CSVEntry.LON]

        # Both coordinates must be checked: previously LAT was tested twice, so a
        # NaN longitude was written out as-is.
        if math.isnan(float(lat)) or math.isnan(float(lon)):
            coords = ' , '
        else:
            coords = f'{lat},{lon}'

        self.csvfile.write(
            f'[{row[CSVEntry.PLACE_ID]}],"{title}","{name}",{row[CSVEntry.TYPE]},'
            f'{coords},{enc},\n')
コード例 #11
0
ファイル: IniHandler.py プロジェクト: prculley/GeoFinder
    def get_directory_from_ini(self) -> str:
        """
        Determine the GeoFinder data directory, using the INI file when it has one.

        When the INI file is missing or holds no PATH/DIRECTORY value, a default folder
        under home_path is recorded in the INI. If the resulting directory does not
        exist, the user is asked to choose one; cancelling that dialog exits the program.
        :return: the directory now stored in self.directory
        """
        stored = self.ini_read('PATH', 'DIRECTORY') if self.ini_path.is_file() else None

        if stored:
            self.directory = stored
        else:
            # Nothing usable in the INI: record the default location there
            default_dir = os.path.join(str(self.home_path),
                                       GeoKeys.get_directory_name())
            self.directory = Path(default_dir)
            self.ini_add_section('PATH')
            self.ini_set(section='PATH',
                         key='DIRECTORY',
                         val=str(self.directory))

        if Path(self.directory).is_dir():
            return self.directory

        # The directory does not exist: prompt the user for a folder
        messagebox.showinfo(
            'Geofinder Folder not found',
            'Choose Folder for GeoFinder data in next dialog')
        self.directory = filedialog.askdirectory(
            initialdir=self.home_path,
            title="Choose Folder for GeoFinder data")
        if len(self.directory) == 0:
            # An empty result means nothing was chosen
            sys.exit()

        self.ini_add_section('PATH')
        self.ini_set(section='PATH',
                     key='DIRECTORY',
                     val=str(self.directory))
        return self.directory
コード例 #12
0
ファイル: UtilErrorFrame.py プロジェクト: prculley/GeoFinder
    def __init__(self, frame, title, dir_name, cache_filename, error):
        """
        Build the error list frame: load its cached data, create the widgets and show the data.

        :param frame: parent container for the widgets
        :param title: text shown in the title label
        :param dir_name: data directory; the cache folder is derived from it
        :param cache_filename: name of the cache file read for this frame
        :param error: stored as self.error
        """
        self.logger = logging.getLogger(__name__)
        self.frame = frame
        self.title = title
        self.error = error
        self.file_error = True
        self.separator = ":"
        self.dirty_flag = False  # Set when the data has been modified

        # Read this frame's list from its cache file
        self.directory = dir_name
        self.cache_dir = GeoKeys.get_cache_directory(dir_name)
        self.logger.debug(f'SetupStatusFrame dir={dir_name} sub_dir={self.cache_dir} file={cache_filename}')
        self.cache = CachedDictionary.CachedDictionary(self.cache_dir, cache_filename)
        self.cache.read()
        self.error_dict = {}  # Errors collected so far

        # Supported countries come from their own cache file
        self.supported_countries_cd = CachedDictionary.CachedDictionary(self.cache_dir, "country_list.pkl")
        self.supported_countries_cd.read()
        self.supported_countries_dct: Dict[str, str] = self.supported_countries_cd.dict

        self.logger.debug(f'country list len={len(self.supported_countries_dct)}')

        # Per-widget layout settings.
        # NOTE(review): values look like [column, row, padx, pady, sticky] - confirm against configure_widgets
        self.grd = {
            "title_label": [0, 0, 5, 5, "W"],
            "scrollbar": [1, 2, 0, 5, "WNS"],
            "status": [0, 1, 5, 5, "W"],
            "add_button": [2, 4, 5, 5, "W"],
            "listbox": [0, 2, 5, 5, "E"],
            "unused": [2, 3, 5, 5, "W"],
            "add_entry": [0, 4, 5, 5, "W"],
            "load_button": [2, 1, 5, 5, "W"],
            "geoname_button": [2, 1, 5, 5, "E"],
            "add_label": [0, 3, 5, 5, "EW"],
        }

        self.title_label = Widge.CLabel(frame, text=self.title, width=80, style='Info.TLabel')
        self.status = Widge.CLabel(frame, text=" ", width=80, style='Highlight.TLabel')
        self.scrollbar = Scrollbar(frame)
        self.listbox = Listbox(frame,
                               width=80,
                               height=20,
                               bg=AppStyle.LT_GRAY,
                               selectmode=MULTIPLE,
                               yscrollcommand=self.scrollbar.set)
        # The button is labelled with the site name and invokes web_handler
        self.add_button = ttk.Button(frame, text="geonames.org", command=self.web_handler, width=12)

        # Configure buttons and widgets
        self.configure_widgets()

        # Display the data
        self.load_handler()
コード例 #13
0
    def read(self) -> bool:
        """
           Insert every country from the built-in country table into the database as an ADM0 row.

           Each name is localized using the first language in lang_list that has a translation table.
           :return: always False
        """
        if self.progress is not None:
            self.progress.update_progress(100, "Read ISO countries...")

        # All inserts below happen between one begin/commit pair
        self.geodb.db.begin()

        self.logger.debug(self.lang_list)

        for country_name, row in country_dict.items():
            # Only the first language that has a translation table is consulted
            for lang in self.lang_list:
                table = trans_table.get(lang)
                if not table:
                    continue
                localized = table.get(country_name)
                if localized:
                    country_name = localized
                break

            # Row layout example: ('paris', 'fr', '07', '012', '12.345', '45.123', 'PPL')
            geo_row = [None] * GeoDB.Entry.MAX
            normalized = GeoKeys.normalize(country_name)
            geo_row[GeoDB.Entry.NAME] = normalized
            # Phonetic key: first element of the dmetaphone result
            geo_row[GeoDB.Entry.SDX] = phonetics.dmetaphone(normalized)[0]

            iso = row[CnRow.ISO].lower()
            geo_row[GeoDB.Entry.ISO] = iso
            geo_row[GeoDB.Entry.ADM1] = ''
            geo_row[GeoDB.Entry.ADM2] = ''
            geo_row[GeoDB.Entry.LAT] = row[CnRow.LAT]
            geo_row[GeoDB.Entry.LON] = row[CnRow.LON]
            geo_row[GeoDB.Entry.FEAT] = 'ADM0'
            # The lower-cased ISO code doubles as the row ID
            geo_row[GeoDB.Entry.ID] = iso

            self.geodb.insert(geo_row=geo_row, feat_code='ADM0')

        self.geodb.db.commit()
        return False
コード例 #14
0
ファイル: Loc.py プロジェクト: prculley/GeoFinder
    def get_five_part_title(self):
        """
        Return the five part title string: prefix,city,county,state,country.

        The place type and country name are changed temporarily while the title
        is built and are restored before returning.
        """
        original_type = self.place_type
        original_country = self.country_name

        # Formatting as a CITY yields the four part name; the prefix makes it five
        self.place_type = PlaceType.CITY
        self.country_name, _ = GeoKeys.country_normalize(self.country_name)

        # Read the prefix BEFORE calling format_full_nm, which may clear it
        if len(self.extra) > 0:
            lead = self.prefix + ' ' + self.extra
        else:
            lead = self.prefix
        full_title = lead + ',' + self.format_full_nm(None)

        # Put the temporarily changed fields back
        self.place_type = original_type
        self.country_name = original_country

        return full_title
コード例 #15
0
    def write_updated_place(self, place: Loc.Loc, entry):
        """
        Write the resolved place name and its lat/lon to the output file.

        Places whose result type is DELETE are not written. When diagnostics are
        enabled, the original entry and the written text (or 'DELETE') are also
        recorded in the diagnostic files.
        :param place: resolved location; its original_entry is replaced by the formatted name
        :param entry: the original text of the entry, used for diagnostics only
        """
        self.geodata.geo_files.geodb.set_display_names(place)
        place.original_entry = place.format_full_nm(
            self.geodata.geo_files.output_replace_dct)
        # NOTE(review): the prefix is read from self.place rather than the `place`
        # argument - confirm both always refer to the same object.
        prefix = GeoKeys.capwords(self.place.prefix)
        if self.diagnostics:
            self.in_diag_file.write(f'{entry}\n')

        if place.result_type == GeoKeys.Result.DELETE:
            if self.diagnostics:
                self.out_diag_file.write('DELETE\n')
            return

        self.ancestry_file_handler.write_updated(
            prefix + place.prefix_commas + place.original_entry, place)
        self.ancestry_file_handler.write_lat_lon(lat=place.lat,
                                                 lon=place.lon)

        # Gate this diagnostic write on the flag like every other one; it used to
        # run unconditionally.
        if self.diagnostics:
            text = prefix + place.prefix_commas + place.original_entry + '\n'
            # NOTE(review): str() of bytes writes the b'...' repr with an escaped
            # newline - kept as-is, confirm this is the wanted diagnostic format.
            text = str(text.encode('utf-8', errors='replace'))
            self.out_diag_file.write(text)
コード例 #16
0
    def read_geoname(self) -> bool:
        """
        Open the geoname database in the cache directory, rebuilding it when it is unusable.

        The database (geodata.db) holds only the important fields of the geonames.org
        files and is much faster to read than those files. It is rebuilt from the
        geonames.org files when it is missing, has the wrong version, or holds fewer
        than 1000 rows.
        :return: False when a usable database is available, True when a rebuild found no geonames files
        """
        cache_dir = GeoKeys.get_cache_directory(self.directory)
        db_path = os.path.join(cache_dir, 'geodata.db')
        self.logger.debug(f'path for geodata.db: {db_path}')
        err_msg = ''

        # Validate the existing database, if any
        if os.path.exists(db_path):
            self.logger.debug(f'DB found at {db_path}')
            self.geodb = GeoDB.GeoDB(db_path=db_path,
                                     version=self.required_db_version)

            ver = self.geodb.get_db_version()
            if ver != self.required_db_version:
                err_msg = f'Database version will be upgraded:\n\n{self.db_upgrade_text}\n\n' \
                    f'Upgrading database from V{ver} to V{self.required_db_version}.'
            else:
                # Correct version. The "DB older than the geonames files" staleness
                # check is disabled, so the DB is always treated as up to date.
                self.logger.info('DB is up to date')
                # Ensure the DB has a reasonable number of records
                count = self.geodb.get_row_count()
                self.logger.info(f'Geoname entries = {count:,}')
                if count < 1000:
                    err_msg = f'Geoname Database is too small.\n\n {db_path}\n\nRebuilding DB '
        else:
            err_msg = f'Database not found at\n\n{db_path}.\n\nBuilding DB'

        self.logger.debug(f'{err_msg}')
        if err_msg == '':
            # No DB errors detected
            self.geodb.create_indices()
            self.geodb.create_geoid_index()
            return False

        # DB error detected - tell the user, then rebuild from the geoname files
        self.logger.debug('message box')
        messagebox.showinfo('Database Error', err_msg)
        self.logger.debug('message box done')

        if os.path.exists(db_path):
            self.geodb.close()
            os.remove(db_path)
            self.logger.debug('Database deleted')

        self.geodb = GeoDB.GeoDB(db_path=db_path,
                                 version=self.required_db_version)
        self.country = Country.Country(self.progress_bar,
                                       geodb=self.geodb,
                                       lang_list=self.lang_list)

        file_count = 0

        # Put in country data
        self.country.read()

        start_time = time.time()

        # Version -1 marks the DB as incomplete until the build has finished
        self.geodb.insert_version(-1)

        # Put in geonames file data
        for fname in [
                'allCountries.txt', 'ca.txt', 'gb.txt', 'de.txt', 'fr.txt',
                'nl.txt'
        ]:
            error = self.read_geoname_file(fname)

            if error:
                self.logger.error(f'Error reading geoname file {fname}')
            else:
                file_count += 1

        if file_count == 0:
            self.logger.error(
                f'No geonames files found in {os.path.join(self.directory, "*.txt")}'
            )
            return True

        self.logger.info(
            f'geonames files done.  Elapsed ={time.time() - start_time}')

        # Put in alias names
        start_time = time.time()
        self.alternate_names.read()
        self.logger.info(
            f'Alternate names done.  Elapsed ={time.time() - start_time}')
        self.logger.info(f'Geonames entries = {self.geodb.get_row_count():,}')

        start_time = time.time()
        self.progress("3) Final Step: Creating Indices for Database...", 95)
        self.geodb.create_geoid_index()
        self.geodb.create_indices()
        self.logger.debug(
            f'Indices done.  Elapsed ={time.time() - start_time}')
        # Build complete: record the real version
        self.geodb.insert_version(self.required_db_version)

        return False
コード例 #17
0
    def read_geoname_file(self, file) -> bool:  # , g_dict
        """Load one geonames.org places file into the database.

        Only rows whose country is in supported_countries_dct and whose feature
        code is in feature_code_list_dct are inserted. Rows that do not have the
        expected number of columns are logged and skipped.

        :param file: file name, looked up inside self.directory
        :return: False when the file was read, True when it does not exist
        """
        Geofile_row = namedtuple(
            'Geofile_row',
            'id name name_asc alt lat lon feat_class feat_code iso iso2 admin1_id'
            ' admin2_id admin3_id admin4_id pop elev dem timezone mod')
        self.line_num = 0
        self.progress("Reading {}...".format(file), 0)
        path = os.path.join(self.directory, file)

        if not os.path.exists(path):
            return True

        fsize = os.path.getsize(path)
        # Rough average line length, used only to estimate progress
        bytes_per_line = 128
        with open(path, 'r', newline="", encoding='utf-8',
                  errors='replace') as geofile:
            self.progress("Building Database from {}".format(file), 2)
            self.geodb.db.begin()

            for line_no, line in enumerate(csv.reader(geofile, delimiter='\t'), start=1):
                self.line_num = line_no
                if line_no % 20000 == 0:
                    # Periodically update progress
                    prog = line_no * bytes_per_line * 100 / fsize
                    self.progress(
                        msg=
                        f"1) Building Database from {file}            {prog:.1f}%",
                        val=prog)

                # Map the csv fields onto the named columns
                try:
                    geoname_row = Geofile_row._make(line)
                except TypeError:
                    self.logger.error(
                        f'Unable to parse geoname location info in {file}  line {self.line_num}'
                    )
                    continue

                # Keep the row only for a supported country and a wanted feature code
                if geoname_row.iso.lower() in self.supported_countries_dct:
                    if geoname_row.feat_code in self.feature_code_list_dct:
                        self.insert_georow(geoname_row)
                        raw_name = geoname_row.name
                        if raw_name.lower() != GeoKeys.normalize(raw_name):
                            # The name changes when normalized: keep the original too
                            self.geodb.insert_alternate_name(
                                raw_name, geoname_row.id, 'ut8')

                bar = self.progress_bar
                if bar is not None and bar.shutdown_requested:
                    # Abort DB build.  Clear out partial DB
                    self.geodb.clear_geoname_data()

        self.progress("Write Database", 90)
        self.geodb.db.commit()
        self.progress("Database created", 100)
        return False
コード例 #18
0
ファイル: Loc.py プロジェクト: prculley/GeoFinder
    def parse_place(self, place_name: str, geo_files):
        """
        Given a comma separated place name, parse into its city, AdminID, country_iso and type of entity (city, country etc)
        Expected format: prefix,city,admin2,admin1,country
        self.status has Result status code
        """
        self.clear()
        self.original_entry = place_name

        # Convert open-brace and open-paren to comma.  close brace/paren will be stripped by normalize()
        res = re.sub('\[', ',', place_name)
        res = re.sub('\(', ',', res)

        tokens = res.split(",")
        token_count = len(tokens)
        self.place_type = PlaceType.CITY

        # Parse City, Admin2, Admin2, Country scanning from the right.  When there are more tokens, we capture more fields
        # Place type is the leftmost item we found - either City, Admin2, Admin2, or Country
        # self.logger.debug(f'***** PLACE [{place_name}] *****')

        if '--' in place_name:
            # Pull out filter flags if present
            self.logger.debug('filter')
            self.filter(place_name, geo_files)
            return
        elif token_count > 0:
            #  COUNTRY - right-most token should be country
            #  Format: Country
            self.place_type = PlaceType.COUNTRY
            self.country_name = GeoKeys.search_normalize(tokens[-1], "")
            self.target = self.country_name

            # Validate country
            self.country_iso = geo_files.geodb.get_country_iso(
                self)  # Get Country country_iso
            if self.country_iso != '':
                # self.logger.debug(f'Found country. iso = [{self.country_iso}]')
                pass
            else:
                # Last token is not COUNTRY.
                # Append blank to token list so we now have xx,admin1, blank_country
                tokens.append('')
                token_count = len(tokens)
                self.result_type = GeoKeys.Result.NO_COUNTRY
                self.country_iso = ''
                self.country_name = ''

        if token_count > 1:
            #  Format: Admin1, Country.
            #  Admin1 is 2nd to last token
            self.admin1_name = GeoKeys.search_normalize(
                tokens[-2], self.country_iso)
            self.admin1_name = GeoKeys.admin1_normalize(
                self.admin1_name, self.country_iso)

            if len(self.admin1_name) > 0:
                self.place_type = PlaceType.ADMIN1
                self.target = self.admin1_name
                # Lookup Admin1
                geo_files.geodb.get_admin1_id(self)
                if self.admin1_id != '':
                    # self.logger.debug(f'Found admin1 {self.admin1_name}')
                    pass
                else:
                    # Last token is not Admin1 - append blank
                    self.admin1_name = ''
                    # Append blank token for admin1 position
                    tokens.append('')
                    token_count = len(tokens)

        if token_count == 3 and self.admin1_name == '' and self.country_name == '':
            # Just one valid token, so take as city
            self.city1 = GeoKeys.search_normalize(tokens[-3], self.country_iso)

            if len(self.city1) > 0:
                self.place_type = PlaceType.CITY
                self.target = self.city1
        elif token_count > 2:
            #  Format: Admin2, Admin1, Country
            #  Admin2 is 3rd to last.  Note -  if Admin2 isnt found, it will look it up as city
            self.admin2_name = GeoKeys.search_normalize(
                tokens[-3], self.country_iso)
            self.admin2_name, modif = GeoKeys.admin2_normalize(
                self.admin2_name, self.country_iso)

            if len(self.admin2_name) > 0:
                self.place_type = PlaceType.ADMIN2
                self.target = self.admin2_name

        if token_count > 3:
            # Format: Prefix, City, Admin2, Admin1, Country
            # City is 4th to last token
            # Other tokens go into Prefix
            self.city1 = GeoKeys.search_normalize(tokens[-4], self.country_iso)
            if len(self.city1) > 0:
                self.place_type = PlaceType.CITY
                self.target = self.city1

            # Assign remaining tokens (if any) to prefix.  Zero'th token to 4th from end.
            for item in tokens[0:-4]:
                if len(self.prefix) > 0:
                    self.prefix += ' '
                self.prefix += str(item.strip(' '))

        # Special case for New York, New York which normally refers to the City, not county
        if self.admin2_name == 'new york' and self.place_type == PlaceType.ADMIN2:
            self.admin2_name = 'new york city'
            self.target = self.admin2_name

        self.prefix = self.prefix.strip(',')

        self.logger.debug(
            f"    ======= PARSE: {place_name} City [{self.city1}] Adm2 [{self.admin2_name}]"
            f" Adm1 [{self.admin1_name}] adm1_id [{self.admin1_id}] Cntry [{self.country_name}] Pref=[{self.prefix}]"
            f" type_id={self.place_type}")
        return
コード例 #19
0
    def __init__(self, directory: str, progress_bar):
        """
        Load the cached settings dictionaries and set up alternate-name support.

        The feature, country, language and output-replacement dictionaries are read
        from the cache directory derived from `directory`.  If the feature dictionary
        holds fewer than 3 entries it is reset to UtilFeatureFrame.default and written
        back to disk.

        :param directory: Base directory; the cache directory is derived from it
        :param progress_bar: Progress bar object, stored and handed on to AlternateNames
        """
        self.logger = logging.getLogger(__name__)

        # Caller supplied values
        self.directory: str = directory
        self.progress_bar = progress_bar

        # Database state
        self.geodb = None
        self.required_db_version = 2
        self.db_upgrade_text = 'Adding support for non-English output'
        self.country = None

        # Bookkeeping
        self.line_num = 0
        self.cache_changed: bool = False

        cache_path = GeoKeys.get_cache_directory(self.directory)

        # Geoname feature codes we should include
        self.feature_code_list_cd = CachedDictionary.CachedDictionary(cache_path, "feature_list.pkl")
        self.feature_code_list_cd.read()
        self.feature_code_list_dct: Dict[str, str] = self.feature_code_list_cd.dict
        if len(self.feature_code_list_dct) < 3:
            # Too few entries to be a real list - fall back to the default feature set and persist it
            self.logger.warning('Feature list is empty. Setting defaults')
            self.feature_code_list_dct.clear()
            for feature_code in UtilFeatureFrame.default:
                self.feature_code_list_dct[feature_code] = ''
            self.feature_code_list_cd.write()

        # Countries (ISO2) we should include
        self.supported_countries_cd = CachedDictionary.CachedDictionary(cache_path, "country_list.pkl")
        self.supported_countries_cd.read()
        self.supported_countries_dct: Dict[str, str] = self.supported_countries_cd.dict

        # Languages (ISO2) we should include.  lang_list holds the dictionary keys
        self.languages_list_cd = CachedDictionary.CachedDictionary(cache_path, "languages_list.pkl")
        self.languages_list_cd.read()
        self.languages_list_dct: Dict[str, str] = self.languages_list_cd.dict
        self.lang_list = list(self.languages_list_dct)

        # Output text replacements.  output_replace_list holds the dictionary keys
        self.output_replace_cd = CachedDictionary.CachedDictionary(cache_path, "output_list.pkl")
        self.output_replace_cd.read()
        self.output_replace_dct: Dict[str, str] = self.output_replace_cd.dict
        self.output_replace_list = list(self.output_replace_dct)
        for replacement in self.output_replace_list:
            self.logger.debug(f'Output replace [{replacement}]')

        self.entry_place = Loc.Loc()

        # Support for Geonames AlternateNames file.  Adds alternate names for entries
        self.alternate_names = AlternateNames.AlternateNames(directory_name=self.directory,
                                                             geo_files=self,
                                                             progress_bar=self.progress_bar,
                                                             filename='alternateNamesV2.txt',
                                                             lang_list=self.lang_list)
コード例 #20
0
    def match_score(self, inp_place: Loc.Loc, res_place: Loc.Loc) -> float:
        """
        Score how well a database result matches the place the user entered.

        :param inp_place: Input place structure with users text
        :param res_place: Result place structure with DB result.  Its prefix is overwritten with ' ' as a side effect
        :return: Weighted score reflecting the difference between the user input and the result.  Nominally 0-100
        where 0 is perfect match and 100 is no match.  The first-token bonus and the parse/wildcard penalties are
        applied on top of that.
        Score is also adjusted based on Feature type.  More important features (large city) get lower result
        """
        in_score = 0.0
        num_inp_tokens = 0.0

        # Create full place title (prefix,city,county,state,country) from input place.
        inp_title = inp_place.get_five_part_title()
        inp_title = GeoKeys.normalize_match_title(inp_title, inp_place.country_iso)
        # Stripped tokens.  Their lengths are the denominators of the percent-unmatched calculation
        inp_tokens = [token.strip(' ') for token in inp_title.split(',')]

        # Create full place title (prefix,city,county,state,country) from result place
        res_place.prefix = ' '
        res_title = res_place.get_five_part_title()
        res_title = GeoKeys.normalize_match_title(res_title, res_place.country_iso)
        res_tokens = res_title.split(',')

        # Join all the words in result and save result len for percent calc
        res_word_list = ', '.join(res_tokens)
        orig_res_len = len(res_word_list)

        # Join all the words in input
        input_words = ', '.join(inp_tokens)

        # Remove any matching sequences in input list and result
        res_word_list, input_words = self.remove_matching_sequences(res_word_list, input_words)

        # NOTE(review): indexing below assumes remove_matching_sequences keeps the comma separators so that
        # unmatched tokens line up with the original input tokens - confirm
        unmatched_input_tokens = input_words.split(',')

        # Calculate percent of USER INPUT text that was unmatched, then apply weighting.
        # Each token in place hierarchy gets a different weighting: Prefix, city, county, state, country
        for idx, token in enumerate(inp_tokens):
            token_len = len(token)
            if token_len == 0:
                # Empty input tokens carry no information and are left out of the average
                continue

            unmatched_percent = int(100.0 * len(unmatched_input_tokens[idx].strip(' ')) / token_len)
            in_score += unmatched_percent * self.weight[idx]
            num_inp_tokens += 1.0 * self.weight[idx]

            # If the full first or second token of the result is in input then improve score
            # Bonus for a full match as against above partial matches.
            # The length check avoids an IndexError when the result title has fewer tokens than the input
            if idx < 2 and idx < len(res_tokens) and res_tokens[idx] in token:
                in_score -= self.first_token_match_bonus

        # Average over number of tokens (with fractional weight).  Gives 0-100% regardless of weighting and
        # number of tokens.  Skip the division when nothing was weighed (all input tokens empty) rather than
        # raising ZeroDivisionError
        if num_inp_tokens > 0:
            in_score = in_score / num_inp_tokens

        # Calculate percent of DB RESULT text that was unmatched
        if orig_res_len > 0:
            out_score = int(100.0 * len(res_word_list.strip(' ')) / orig_res_len)
        else:
            out_score = 0

        # If Tokens were not in hierarchical order, give penalty
        parse_penalty = 0.0 if inp_place.standard_parse else self.wrong_order_penalty

        # if it was a wildcard search it's hard to rank - add a penalty
        wildcard_penalty = self.wildcard_penalty if '*' in inp_place.original_entry else 0.0

        # Feature score is to ensure "important" places  get  higher rank (large city, etc)
        feature_score = Geodata.Geodata.get_priority(res_place.feature)

        # Add up scores - input gets whatever weight is left over after result and feature weights
        in_weight = 1.0 - self.out_weight - self.feature_weight

        score = (in_score * in_weight
                 + out_score * self.out_weight
                 + feature_score * self.feature_weight
                 + parse_penalty
                 + wildcard_penalty)

        return score
コード例 #21
0
    def handle_place_entry(self):
        """
        Work through the PLACE entries of the users file until one needs the users attention.

        Entries that have a global replacement, are on the skip list, or are a strong database match
        are written out without user interaction.  The loop stops at the first entry that needs review
        (or whose stored replacement could not be applied) and shows it with display_result.
        """
        self.w.original_entry.set_text("")
        self.periodic_update("Shutting down..." if self.w.prog.shutdown_requested else "Scanning")
        self.clear_detail_text(self.place)

        while True:
            self.err_count += 1  # Counter is used to periodically update

            # Update statistics and progress bar
            done = self.update_statistics()
            place_total = self.ancestry_file_handler.place_total
            percent_done = 100 * done / place_total if place_total > 0 else 0
            self.w.prog.update_progress(percent_done, " ")

            # Fetch the next PLACE entry from the file
            self.place.clear()
            town_entry, eof, rec_id = self.ancestry_file_handler.get_next_place()
            town_entry = GeoKeys.semi_normalize(town_entry)
            self.place.id = rec_id

            if eof:
                self.end_of_file_shutdown()

            # See if we already have a fix (Global Replace) or Skip (ignore) for this entry
            replacement_geoid = self.get_replacement(self.global_replace, town_entry, self.place)

            if replacement_geoid is not None:
                # IN GLOBAL REPLACE LIST - There is a global change that we can apply to this line.
                self.matched_count += 1
                stored_result = self.place.result_type

                if stored_result == GeoKeys.Result.STRONG_MATCH:
                    # Output updated place to ancestry file and display status to user
                    self.write_updated_place(self.place, town_entry)
                    self.periodic_update(
                        "Creating Import..." if self.w.prog.shutdown_requested else "Applying change")
                    continue

                if stored_result == GeoKeys.Result.DELETE:
                    continue

                # The stored replacement could not be applied.  Show the error and let the user handle it
                self.logger.warning(f'***ERROR looking up GEOID=[{replacement_geoid}] for [{town_entry}] ')
                # Set place date to event date (geo names change over time)
                self.place.event_year = int(self.ancestry_file_handler.event_year)
                self.w.original_entry.set_text(
                    f'** DATABASE ERROR FOR GEOID=[{replacement_geoid}] for [{town_entry}]')
                self.w.user_entry.set_text(f'{town_entry}')
                self.geodata.find_location(town_entry, self.place, self.w.prog.shutdown_requested)
                break

            if self.skiplist.get(town_entry) is not None:
                # IN SKIPLIST - Write out as-is and go to next entry
                self.skip_count += 1
                self.periodic_update("Skipping")
                self.ancestry_file_handler.write_asis(town_entry)
                continue

            # No global replace or skip for this entry.  See if it is in our place database
            # Set place date to event date (geo names change over time)
            self.place.event_year = int(self.ancestry_file_handler.event_year)
            self.geodata.find_location(town_entry, self.place, self.w.prog.shutdown_requested)

            found_match = self.place.result_type in GeoKeys.successful_match

            if found_match and self.place.result_type == GeoKeys.Result.STRONG_MATCH:
                # Strong match - write out line without user verification
                self.matched_count += 1
                self.periodic_update(
                    "Creating Import..." if self.w.prog.shutdown_requested else "Scanning")

                # Add to global replace list - Use '@' for tokenizing.  Save GEOID_TOKEN and PREFIX_TOKEN
                res = '@' + self.place.geoid + '@' + self.place.prefix
                self.global_replace.set(town_entry, res)
                self.logger.debug(f'Found Strong Match for {town_entry} res= [{res}] Setting DICT')

                # Periodically flush dictionary to disk.  (We flush on exit as well)
                if self.err_count % 100 == 1:
                    self.global_replace.write()

                self.write_updated_place(self.place, town_entry)
                continue

            if self.w.prog.shutdown_requested:
                # User requested shutdown.  Write this item out as-is (weak match or no match alike)
                self.review_count += 1
                self.periodic_update("Creating Import...")
                self.w.original_entry.set_text(" ")
                self.ancestry_file_handler.write_asis(town_entry)
                continue

            # Weak match or no match - hand the entry to the user
            if found_match:
                self.logger.debug(f'user review for {town_entry} res= [{self.place.result_type}] ')

            self.w.status.configure(style="Good.TLabel")
            self.w.original_entry.set_text(self.place.original_entry)  # Display place
            self.w.user_entry.set_text(self.place.original_entry)  # Display place
            break

        # Have user review the result
        self.display_result(self.place)
コード例 #22
0
    def __init__(self):
        """
        Start GeoFinder: parse the command line, build the main window, verify the
        directories and Geonames files, then enter the Tk main loop.
        """
        print(f'GeoFinder v{__version__.__version__}')
        print(f'Python {sys.version_info[0]}.{sys.version_info[1]}')

        if sys.version_info < (3, 6, 0):
            raise Exception("GeoFinder Requires Python 3.6 or higher.")
        # NOTE(review): this line is printed on every start.  Presumably it is here so that older
        # interpreters fail on the f-string with a readable message - confirm
        val = ''
        print(f'GeoFinder Requires Python 3.6 or higher {val}')

        # Counters
        self.err_count = 0
        self.matched_count = 0
        self.review_count = 0
        self.skip_count = 0

        # UI state
        self.save_enabled = False  # Only allow SAVE when we have an item that was matched in geonames
        self.user_selected_list = False  # Indicates whether user selected a list entry or text edit entry
        self.odd = False

        # Objects that are not created here
        self.ancestry_file_handler = None
        self.place = None
        self.skiplist = None
        self.global_replace = None
        self.geodata = None

        # Output and diagnostics files
        self.out_suffix = 'unknown_suffix'
        self.out_diag_file = None
        self.in_diag_file = None

        # Read arguments from the command line
        parser = argparse.ArgumentParser()
        parser.add_argument("--logging", help="Enable quiet logging")
        parser.add_argument("--diagnostics", help="Create diagnostics files")
        args = parser.parse_args()

        # --logging info selects the INFO logging setup, anything else the default setup
        if args.logging != 'info':
            self.logger = self.setup_logging('geofinder Init')
        else:
            self.logger = self.setup_logging_info('geofinder Init')
            self.logger.info(f"--logging set to INFO logging {args.logging}")

        # --diagnostics switch
        self.diagnostics = bool(args.diagnostics)
        if self.diagnostics:
            self.logger.info(f"--diagnostics files enabled {args.diagnostics}")

        # Create App window and configure  window buttons and widgets
        self.w: AppLayout.AppLayout = AppLayout.AppLayout(self)
        self.w.create_initialization_widgets()
        self.w.config_button.config(state="normal")

        # Get our base directory path from INI file.  The default under the home folder is
        # replaced by whatever the INI handler returns
        home_path = str(Path.home())
        self.directory = Path(os.path.join(home_path, GeoKeys.get_directory_name()))
        self.ini_handler = IniHandler(home_path=home_path, ini_name='geofinder.ini')
        self.directory = self.ini_handler.get_directory_from_ini()

        self.cache_dir = GeoKeys.get_cache_directory(self.directory)
        self.logger.info(f'Cache directory {self.cache_dir}')

        # Set up configuration  class
        self.cfg = Config.Config(self.directory)
        self.util = UtilLayout.UtilLayout(root=self.w.root, directory=self.directory, cache_dir=self.cache_dir)

        if not os.path.exists(self.cache_dir):
            # Cache folder is missing.  Ask before creating directories for GeoFinder
            create_confirmed = messagebox.askyesno('Geoname Data Cache Folder not found',
                                                   f'Create Geoname Cache folder?\n\n{self.cache_dir} ')
            if not create_confirmed:
                self.shutdown()
            else:
                self.cfg.create_directories()
                # Success is judged by the folder now existing
                if os.path.exists(self.cache_dir):
                    self.logger.debug(f'Created {self.cache_dir}')
                    messagebox.showinfo('Geoname Data Cache Folder created',
                                        f'Created folder\n\n{self.cache_dir} ')
                else:
                    messagebox.showwarning('Geoname Data Cache Folder not found',
                                           f'Unable to create folder\n\n{self.cache_dir} ')
                    self.shutdown()

        # Ensure GeoFinder directory structure is valid, then see if required Geonames files are present
        if not self.cfg.valid_directories():
            # Missing directories
            self.logger.warning(f'Directories not found: {self.cache_dir} ')
            self.w.status.set_text("Click Config to set up Geo Finder")
            self.w.load_button.config(state="disabled")
            TKHelper.set_preferred_button(self.w.config_button, self.w.initialization_buttons,
                                          "Preferred.TButton")
        elif self.check_configuration():
            # Missing files
            self.logger.warning('Missing files')
            self.w.status.set_text("Click Config to set up Geo Finder")
            TKHelper.set_preferred_button(self.w.config_button, self.w.initialization_buttons,
                                          "Preferred.TButton")
            self.w.load_button.config(state="disabled")
        else:
            # No config errors.  Read config settings (Ancestry file path)
            err = self.cfg.read()
            if err:
                self.logger.warning(f'error reading {self.cache_dir} config.pkl')

            gedcom_path = self.cfg.get("gedcom_path")
            self.w.original_entry.set_text(gedcom_path)
            TKHelper.enable_buttons(self.w.initialization_buttons)

            if os.path.exists(gedcom_path):
                # File is valid.  Prompt user to click Open for file
                self.w.status.set_text(f"Click Open to load {file_types} file")
                TKHelper.set_preferred_button(self.w.load_button, self.w.initialization_buttons,
                                              "Preferred.TButton")
            else:
                # No file.  Prompt user to select a file - GEDCOM file name isn't valid
                self.w.status.set_text(f"Choose a {file_types} file")
                self.w.load_button.config(state="disabled")
                TKHelper.set_preferred_button(self.w.choose_button, self.w.initialization_buttons,
                                              "Preferred.TButton")

        # Flag to indicate whether we are in startup or in Window loop.  Determines how window idle is called
        self.startup = False

        # ENTER MAIN LOOP and Wait for user to click on load button
        self.w.root.mainloop()