Beispiel #1
0
    def __init__(self):
        """
        Build an empty location record; every field starts at its default value.
        """
        self.logger = logging.getLogger(__name__)

        # Entry text
        self.original_entry: str = ''
        self.updated_entry: str = ''
        self.prefix: str = ''  # Prefix (entries prepended before geoname location)

        # Coordinates start out as NaN
        self.lat: float = float('NaN')  # Latitude
        self.lon: float = float('NaN')  # Longitude

        # Country / administrative hierarchy
        self.country_iso: str = ''  # Country ISO code
        self.country_name: str = ''
        self.admin1_name: str = ''  # Admin1 (State/province/etc)
        self.admin1_id: str = ''  # Admin1 Geoname ID
        self.admin2_name: str = ''  # Admin2 (county)
        self.admin2_id: str = ''  # Admin2 Geoname ID
        self.city: str = ''  # City or entity name
        self.enclosed_by: str = ''  # The entity that encloses this.  E.g United States encloses Texas

        # Geoname classification
        self.geoid: str = ''  # Geoname GEOID
        self.feature: str = ''  # Geoname feature code
        self.place_type: int = PlaceType.COUNTRY  # Is this a Country, Admin1, Admin2 or city?
        self.score: float = 100.0
        self.event_year: int = 0

        # Lookup result info
        self.status: str = ''
        self.status_detail: str = ''
        self.result_type: int = GeoUtil.Result.NO_MATCH  # Result type of lookup
        self.result_type_text: str = ''  # Text version of result type
        self.georow_list: List = []  # List of items that matched this location

        # Helper objects
        self.norm = Normalize.Normalize()
        self.geo_db = None
Beispiel #2
0
    def update_global_replacement_list(self, key, geoid, prefix):
        """
        Store a replacement entry in the global replacement dictionary.

        # Args:
            key: place text; it is normalized (commas kept) before being used as the dictionary key
            geoid: handed to ReplacementDictionary.build_replacement_entry
            prefix: handed to ReplacementDictionary.build_replacement_entry
        """
        replacement = ReplacementDictionary.build_replacement_entry(geoid, prefix)
        normalized_key = Normalize.normalize(text=key, remove_commas=False)
        self.global_replace.set(normalized_key, replacement)

        # Write the dictionary out whenever the update counter is one past a multiple of ten
        flush_due = self.update_counter % 10 == 1
        if flush_due:
            self.global_replace.write()
Beispiel #3
0
 def __init__(self, geodb):
     """
     Set up the search state for the supplied geo database object.

     # Args:
         geodb: kept as self.geodb
     """
     self.geodb = geodb
     self.logger = logging.getLogger(__name__)

     # Column list used in select statements
     self.select_str = 'name, country, admin1_id, admin2_id, lat, lon, feature, geoid, sdx'
     self.place_type = ''

     # Behaviour flags
     self.detailed_debug = True
     self.use_wildcards = True

     # Counters and cache all start empty
     self.start = 0
     self.total_lookups = 0
     self.cache = {}

     # Helper objects
     self.match = MatchScore.MatchScore()
     self.norm = Normalize.Normalize()
     self.place = Loc.Loc()
Beispiel #4
0
def get_soundex(text) -> str:
    """
    Build a lower-case phonetic key for text.

    Every word returned by Normalize.sorted_normalize(text) is converted with
    get_word_soundex and the resulting codes are joined with single spaces.
    When the joined result is empty, text itself is used instead.

    # Returns:
        The joined codes (or text as the fallback), lower-cased
    """
    codes = [get_word_soundex(word) for word in Normalize.sorted_normalize(text)]
    return (' '.join(codes) or text).lower()
Beispiel #5
0
    def __init__(self):
        """
        Create a scorer: zero all weights, then apply the default weighting.
        """
        self.logger = logging.getLogger(__name__)
        self.score_diags = ''  # Diagnostic text for scoring

        # Every weight starts empty/zero before set_weighting() is called
        self.token_weight = []
        self.prefix_weight = self.feature_weight = self.input_weight = 0.0

        # Default weighting.  Token weights are per input term match - adm2, adm1, country
        self.set_weighting(token_weight=[.2, .3, .5],
                           prefix_weight=6.0,
                           feature_weight=0.15)

        self.wildcard_penalty = 8.0
        self.norm = Normalize.Normalize()
Beispiel #6
0
    def __init__(self, db_path, show_message: bool, exit_on_error: bool,
                 set_speed_pragmas: bool, db_limit: int):
        """
            Open the geoname database and prepare it for lookups.
            A database file that was already present is sanity-tested before use.
        # Args:
            db_path: full path to database file
            show_message: If True, show messagebox to user on error
            exit_on_error: If True, exit if significant error occurs
            set_speed_pragmas: If True, call the DB object's set_speed_pragmas()
            db_limit: SQL LIMIT parameter
        # Raises:
            ValueError('Cannot open database'), ValueError('Database empty or corrupt')
        """
        self.logger = logging.getLogger(__name__)
        self.db_path = db_path
        self.show_message = show_message
        self.exit_on_error = exit_on_error
        self.max_query_results = 50

        # Lookup statistics
        self.total_time = 0
        self.total_lookups = 0
        self.slow_lookup = 0

        self.match = MatchScore.MatchScore()
        self.norm = Normalize.Normalize()

        # Note whether the file is present before DB.DB() is called on it
        db_existed = os.path.exists(db_path)

        self.db = DB.DB(db_filename=db_path,
                        show_message=show_message,
                        exit_on_error=exit_on_error)
        if self.db.err != '':
            self.logger.error(f"Error! cannot open database {db_path}.")
            raise ValueError('Cannot open database')

        # The sanity test is only run on a database file that was already there
        if db_existed:
            sanity_error = self.db.test_database(
                'name',
                'main.geodata',
                where='name = ? AND country = ?',
                args=('ba', 'fr'))

            if sanity_error:
                self.logger.warning(f'DB error for {db_path}')

                # With message boxes enabled, offer to delete the failing database file
                question = f'Geoname database is empty or corrupt:\n\n {db_path} \n\nDo you want to delete it and rebuild?'
                if show_message and messagebox.askyesno('Error', question):
                    messagebox.showinfo('', 'Deleting Geoname database')
                    self.db.conn.close()
                    os.remove(db_path)

                # A failed sanity test always ends construction: exit or raise
                if exit_on_error:
                    sys.exit()
                else:
                    raise ValueError('Database empty or corrupt')

        if set_speed_pragmas:
            self.db.set_speed_pragmas()

        # Query settings
        self.db_limit = db_limit
        self.db.order_string = ''
        self.db.limit_string = f'LIMIT {self.db_limit}'
        self.place_type = ''
        self.s: GeoSearch.GeoSearch = GeoSearch.GeoSearch(geodb=self)
Beispiel #7
0
    def __init__(self,
                 directory: str,
                 display_progress,
                 show_message: bool,
                 exit_on_error: bool,
                 languages_list_dct: dict,
                 feature_code_list_dct: dict,
                 supported_countries_dct: dict,
                 volume=''):
        """
        Read in datafiles needed for geodata, filter them and create a sql db.
        Filter dictionary examples (this constructor iterates languages_list_dct
        for its keys and stores the other two unchanged):
            languages_list_dct={'fr','de'}
            feature_code_list_dct={'PPL', 'ADM1', 'CSTL'}
            supported_countries_dct = {'us','gb','at'}
        # Args:
            directory: base directory
            display_progress: None or Handler called with percent_done:int, msg:str
            show_message: True to show message boxes to user on errors
            exit_on_error:  True to exit on serious errors
            languages_list_dct: dictionary containing the ISO-2 languages  to load from alternateNames
            feature_code_list_dct: dictionary containing the Geonames.org feature codes to load
            supported_countries_dct: dictionary containing the ISO-2 countries to load
            volume: disk volume to use - e.g. C: for Windows or /Volumes/xyz for OSX, /media/xyz for linux
        """
        self.logger = logging.getLogger(__name__)
        self.geodb: [GeoDB.GeoDB, None] = None
        self.show_message = show_message
        self.geoid_main_dict = {}  # Key is GEOID, Value is DB ID for entry
        self.geoid_admin_dict = {}  # Key is GEOID, Value is DB ID for entry
        # TODO fix volume handling
        self.volume = volume
        self.collate = 'COLLATE NOCASE'

        self.exit_on_error = exit_on_error
        self.required_db_version = 4
        # Message to user upgrading from earlier DB version
        self.db_upgrade_text = 'Renamed column to Feature'
        self.directory: str = directory
        self.progress_bar = display_progress
        self.line_num = 0
        self.cache_changed: bool = False
        sub_dir = GeoUtil.get_cache_directory(self.directory)
        self.country = None
        self.languages_list_dct = languages_list_dct
        self.feature_code_list_dct = feature_code_list_dct
        self.supported_countries_dct = supported_countries_dct
        self.norm = Normalize.Normalize()

        # Languages as a list (the keys of languages_list_dct)
        self.lang_list = list(self.languages_list_dct)

        # A non-empty volume becomes the working directory before the cache directory is checked
        if volume != '':
            os.chdir(volume)
        if not os.path.exists(sub_dir):
            self.logger.warning(f'Directory {sub_dir} NOT FOUND')
            if self.show_message:
                messagebox.showwarning(
                    'Folder not found',
                    f'Directory\n\n {sub_dir}\n\n NOT FOUND')
            if exit_on_error:
                sys.exit()
            # Without exit_on_error, construction carries on even though the directory is missing

        # Read in Text Replacement dictionary pickle - this has output text replacements
        self.output_replace_cd = CachedDictionary.CachedDictionary(
            sub_dir, "output_list.pkl")
        self.output_replace_cd.read()
        self.output_replace_dct: Dict[str, str] = self.output_replace_cd.dict
        # Keys of the output replacement dictionary as a list
        self.output_replace_list = list(self.output_replace_dct)

        self.entry_place = Loc.Loc()

        # Support for Geonames AlternateNames file.  Adds alternate names for entries
        self.alternate_names = AlternateNames.AlternateNames(
            directory=self.directory,
            geo_build=self,
            progress_bar=self.progress_bar,
            prefix="Step 3 of 4) ",
            filename='alternateNamesV2.txt',
            lang_list=self.lang_list)
Beispiel #8
0
    def process_place_entries(self):
        """
        Handle PLACE entries in users file.  Replace it, skip it, or have user correct it.

        Loops over the entries returned by ancestry_file_handler.get_next_place() and
        handles each one without user interaction where possible:
          - an entry with a global replacement is written out updated (STRONG_MATCH)
            or dropped (DELETE)
          - an entry on the skiplist is written out as-is
          - a new strong match is added to the global replacement list and written out
          - when shutdown has been requested, anything else is written out as-is
        The loop is left as soon as an entry needs user review, and that entry is
        then shown through display_result().
        """
        self.w.original_entry.text = ""

        if self.w.prog.shutdown_requested:
            self.periodic_update("Shutting down...")
        else:
            self.periodic_update("Scanning")
        self.clear_result_list(self.place)

        while True:
            # Keep reading place entries until we need User review or reach End Of File
            self.update_counter += 1  # Counter is used to periodically update
            # Update statistics
            self.update_statistics()

            # Find the next PLACE entry in  file
            # Process it and keep looping until we need user review
            self.place.clear()
            town_entry, eof, rec_id = self.ancestry_file_handler.get_next_place(
            )
            self.place.updated_entry = town_entry
            self.place.id = rec_id
            # NOTE(review): the entry is normalized before eof is checked - confirm
            # get_next_place() still returns text that normalize() accepts at end of file
            town_entry = Normalize.normalize(text=town_entry,
                                             remove_commas=False)

            # NOTE(review): nothing here leaves the loop on eof; presumably
            # end_of_file_shutdown() does not return - confirm
            if eof:
                self.end_of_file_shutdown()

            # See if we already have a fix (Global Replace) or Skip (ignore).
            # Otherwise see if we can find it or have user handle it
            replacement_geoid = self.get_replacement(self.global_replace,
                                                     town_entry, self.place)

            if replacement_geoid is not None:
                # There is already a global change that we can apply to this entry.
                self.matched_count += 1

                if self.place.result_type == GeoUtil.Result.STRONG_MATCH:
                    # REPLACE - Output the updated place to ancestry file
                    self.write_updated_place(self.place, town_entry)

                    # Display status to user
                    if self.w.prog.shutdown_requested:
                        self.periodic_update("Creating Import...")
                    else:
                        self.periodic_update("Applying change")
                elif self.place.result_type == GeoUtil.Result.DELETE:
                    # DELETE - Don't write out this place
                    continue
                else:
                    # ERROR - We previously found an update, but the GEOID for replacement can no longer be found
                    self.logger.warning(
                        f'***ERROR looking up GEOID=[{replacement_geoid}] for [{town_entry}] '
                    )
                    self.place.event_year = int(
                        self.ancestry_file_handler.event_year
                    )  # Set place date to event date (geo names change over time)
                    self.w.original_entry.text = f'** DATABASE ERROR FOR GEOID=[{replacement_geoid}] for [{town_entry}]'
                    self.w.user_entry.text = f'{town_entry}'
                    # Look the entry up again, then leave the loop so the user can review it
                    self.geodata.find_matches(town_entry, self.place,
                                              self.w.prog.shutdown_requested)
                    break
                # Replacement was written - go on to the next entry
                continue
            elif self.skiplist.get(town_entry) is not None:
                # SKIP - User marked place as SKIP - Write out as-is and go to next error
                self.skip_count += 1
                self.periodic_update("Skipping")
                self.ancestry_file_handler.write_asis(town_entry)
                continue
            else:
                # FOUND a PLACE entry that we don't already have a global replace or skip for
                # See if it is in the place database
                self.place.event_year = int(
                    self.ancestry_file_handler.event_year
                )  # Set place date to event date (geo names change over time)
                self.geodata.find_matches(town_entry, self.place,
                                          self.w.prog.shutdown_requested)

                if self.place.result_type in GeoUtil.successful_match:
                    # SUCCESSFUL MATCH - either a strong match or one that needs review
                    if self.place.result_type == GeoUtil.Result.STRONG_MATCH:
                        # FOUND STRONG MATCH - no user verification needed
                        self.matched_count += 1

                        # Write out line without user verification
                        if self.w.prog.shutdown_requested:
                            self.periodic_update("Creating Import...")
                        else:
                            self.periodic_update("Scanning")

                        # Add to global replace list
                        self.update_global_replacement_list(
                            key=town_entry,
                            geoid=self.place.geoid,
                            prefix=self.place.prefix)
                        self.write_updated_place(self.place, town_entry)
                        self.logger.debug(
                            f'Found Strong Match for {town_entry} Setting DICT'
                        )
                        continue
                    else:
                        # WEAK MATCH OR MULTIPLE MATCHES
                        if self.w.prog.shutdown_requested:
                            # User requested shutdown - so no user interaction.  Write this item out as-is
                            self.review_count += 1
                            self.periodic_update("Creating Import...")
                            self.w.original_entry.text = " "
                            self.ancestry_file_handler.write_asis(town_entry)
                            continue
                        else:
                            # USER REVIEW - Have user review the match
                            self.logger.debug(
                                f'user review for {town_entry} res= [{self.place.result_type}] '
                            )

                            self.w.status.configure(style="Good.TLabel")
                            self.w.original_entry.text = self.place.original_entry  # Display place
                            self.w.user_entry.text = self.place.updated_entry  # Display place
                            # Break out of loop and have user review the match
                            break
                else:
                    # NO MATCH FOUND
                    if self.w.prog.shutdown_requested:
                        # User requested shutdown.  Write this item out as-is
                        self.review_count += 1
                        self.periodic_update("Creating Import...")
                        self.w.original_entry.text = " "
                        self.ancestry_file_handler.write_asis(town_entry)
                        continue
                    else:
                        # USER REVIEW - Have user review entry
                        # Both fields show the original entry here (no updated entry to offer)
                        self.w.status.configure(style="Good.TLabel")
                        self.w.original_entry.text = self.place.original_entry  # Display place
                        self.w.user_entry.text = self.place.original_entry  # Display place
                        # Break out of loop and have user review the item
                        break

        # Have user review the result
        self.display_result(self.place)
Beispiel #9
0
    def load_data_files(self) -> bool:
        """
        Load the cached dictionaries and open the geoname data used by GeoFinder.

        Reads the skip list and the global replacement list, re-keys the global
        replacements with normalized keys, reads the feature, country and language
        lists, then creates the Geodata object and opens it.
        #Returns:
            Error - True if error occurred
        """
        # Skip list
        self.skiplist = CachedDictionary.CachedDictionary(
            self.cache_dir, "skiplist.pkl")
        self.skiplist.read()

        # Global replacement list
        self.global_replace = CachedDictionary.CachedDictionary(
            self.cache_dir, "global_replace.pkl")
        self.global_replace.read()

        # Re-key every global_replace item under its normalized key.
        # The keys are snapshotted first because the dictionary is modified in the loop.
        replacements = self.global_replace.dict
        for old_key in list(replacements):
            value = replacements.pop(old_key)
            replacements[Normalize.normalize(text=old_key,
                                             remove_commas=False)] = value

        # Geoname features we should include
        self.feature_code_list_cd = CachedDictionary.CachedDictionary(
            self.cache_dir, "feature_list.pkl")
        self.feature_code_list_cd.read()
        feature_codes: Dict[str, str] = self.feature_code_list_cd.dict
        if len(feature_codes) < 3:
            # Fewer than three features: replace the list with the defaults and save it
            self.logger.warning('Feature list is empty.')
            feature_codes.clear()
            feature_codes.update(dict.fromkeys(UtilFeatureFrame.default, ''))
            self.feature_code_list_cd.write()

        # Countries (ISO2) we should include
        self.supported_countries_cd = CachedDictionary.CachedDictionary(
            self.cache_dir, "country_list.pkl")
        self.supported_countries_cd.read()
        countries: Dict[str, str] = self.supported_countries_cd.dict

        # Languages (ISO2) we should include
        self.languages_list_cd = CachedDictionary.CachedDictionary(
            self.cache_dir, "languages_list.pkl")
        self.languages_list_cd.read()
        languages: Dict[str, str] = self.languages_list_cd.dict

        # Create the geo data object from the three filter dictionaries
        self.geodata = Geodata(directory_name=self.directory,
                               progress_bar=self.w.prog,
                               enable_spell_checker=self.enable_spell_checker,
                               show_message=True,
                               exit_on_error=True,
                               languages_list_dct=languages,
                               feature_code_list_dct=feature_codes,
                               supported_countries_dct=countries)

        # If the list of supported countries is unusually short, display note to user
        country_count = self.display_country_note()
        self.logger.info('{} countries will be loaded'.format(country_count))

        # Open Geoname Gazeteer DB - city names, lat/long, etc.
        open_failed = self.geodata.open()
        if open_failed:
            TKHelper.fatal_error(MISSING_FILES)

        self.w.root.update()
        self.w.prog.update_progress(100, " ")
        return open_failed