Ejemplo n.º 1
0
    def find_geoid(self, geoid: str, place: Loc) -> None:
        """
        Look up a single entry by its geoid.

        The regular lookup is tried first; if it produces no rows the lookup
        is repeated with admin=True.
        #Args:
            geoid:  Geonames.org geoid
            place:  Loc instance.  geoid, georow_list and result_type are updated in place

        #Returns: None.  place.result_type is STRONG_MATCH if a row was found, otherwise NO_MATCH

        """
        place.geoid = geoid
        place.georow_list.clear()

        searcher = self.geo_build.geodb.s
        searcher.lookup_geoid(georow_list=place.georow_list,
                              geoid=place.geoid,
                              place=place)
        if not place.georow_list:
            # Nothing found by the regular lookup - retry with admin=True
            searcher.lookup_geoid(georow_list=place.georow_list,
                                  geoid=place.geoid,
                                  place=place,
                                  admin=True)

        if not place.georow_list:
            place.result_type = GeoUtil.Result.NO_MATCH
            return

        # Found it: copy the first row into place
        place.result_type = GeoUtil.Result.STRONG_MATCH
        self.process_results(place=place,
                             flags=ResultFlags(limited=False, filtered=False))
Ejemplo n.º 2
0
    def process_results(self, place: Loc, flags) -> None:
        """
            Update fields in place using the first entry in place.georow_list.
            A NOT_SUPPORTED result forces the place type to COUNTRY.
            If rows are present and the result is a successful match, the first row is copied into place.
            If rows are present but the result is neither successful nor NOT_SUPPORTED, the result
            is downgraded to PARTIAL_MATCH.
        #Args:
            place: Loc instance
            flags: Flags tuple as returned by filter_results (not used by this method)

        #Returns:
            None.  place instance fields are updated
        """
        not_supported = place.result_type == GeoUtil.Result.NOT_SUPPORTED
        if not_supported:
            place.place_type = Loc.PlaceType.COUNTRY

        if len(place.georow_list) > 0:
            if place.result_type in GeoUtil.successful_match:
                # Populate place from the best (first) row
                self.geo_build.geodb.copy_georow_to_place(
                    row=place.georow_list[0], place=place, fast=False)
            elif not not_supported:
                place.result_type = GeoUtil.Result.PARTIAL_MATCH

        place.set_place_type_text()
Ejemplo n.º 3
0
    def fast_score(self, target_place: Loc, result_place: Loc) -> float:
        """
            Get a rough, fast score for similarity between target and result.
            The score is 100 minus the token-sort ratio of the two five part titles,
            so 0 is best and 100 is worst (assumes fuzz.token_sort_ratio yields 0-100 - TODO confirm).
        #Args:
            target_place: Loc instance for the place being searched for
            result_place: Loc instance for the candidate result

        #Returns:
            Mismatch score
        """
        # Result title is built first, then target title (same order as the arguments below)
        return 100 - fuzz.token_sort_ratio(result_place.get_five_part_title(),
                                           target_place.get_five_part_title())
Ejemplo n.º 4
0
    def _find_type_as_city(self, place: Loc, typ) -> int:
        """
            Do a lookup using the field specified by typ as a city name.  E.g. if typ is PlaceType.ADMIN2 then
            use the place.admin2_name field to do the city lookup
        #Args:
            place: Loc instance.  place.city may be overwritten (and place.admin2_name blanked for ADMIN2)
            typ: Loc.PlaceType - Specifies which field to use as target for lookup
                 (CITY, ADMIN2, PREFIX or ADVANCED_SEARCH; anything else logs a warning)

        #Returns:
            Best score found.  999 if no lookup was performed (source field empty or unknown typ)
            place.georow_list is updated with matches
        """
        typ_name = ''
        if typ == Loc.PlaceType.CITY:
            # Try City as city (do as-is)
            typ_name = 'City'
        elif typ == Loc.PlaceType.ADMIN2:
            # Try ADMIN2 as city
            if place.admin2_name != '':
                place.city = place.admin2_name
                place.admin2_name = ''
                typ_name = 'Admin2'
        elif typ == Loc.PlaceType.PREFIX:
            # Try Prefix as City
            if place.prefix != '':
                place.city = place.prefix
                typ_name = 'Prefix'
        elif typ == Loc.PlaceType.ADVANCED_SEARCH:
            # Advanced Search
            # NOTE(review): this calls geodb.lookup_place while the other paths use geodb.s.lookup_place - confirm
            return self.geo_build.geodb.lookup_place(place=place)
        else:
            self.logger.warning(f'Unknown TYPE {typ}')

        if typ_name == '':
            # Nothing to look up
            return 999

        self.logger.debug(
            f'2) Try {typ_name} as City.  Target={place.city}  pref [{place.prefix}] '
        )

        place.place_type = Loc.PlaceType.CITY
        best = self.geo_build.geodb.s.lookup_place(place=place)

        self.logger.debug(f'best={best}')
        if best >= MatchScore.Score.POOR_CUTOFF:
            # No good match - fall back to deep lookup
            self.logger.debug('--- DEEP SEARCH ADM2 ---')
            best = self.geo_build.geodb.s.deep_lookup(place=place)
        return best
Ejemplo n.º 5
0
    def run_test_score(idx) -> int:
        """
            Prepare test case idx, score the input place against the result place and log both
            normalized titles with the score.
        #Args:
            idx: index of the test case passed to TestScoring.prepare_test

        #Returns:
            Score returned by TestScoring.scoring.match_score
        """
        target = Loc.Loc()
        candidate = Loc.Loc()
        TestScoring.prepare_test(idx, target, candidate)

        result = TestScoring.scoring.match_score(target, candidate)

        # Titles are only used for the debug line
        target_title = MatchScore.full_normalized_title(target)
        candidate_title = MatchScore.full_normalized_title(candidate)
        TestScoring.logger.debug(f'     {idx}) {result:.1f} In=[{target_title}] Out=[{candidate_title}]')
        return result
Ejemplo n.º 6
0
 def full_normalized_title(self, place: Loc) -> str:
     """
         Create a full normalized five part title (includes prefix)
     #Args:
         place: Loc instance supplying the five part title

     #Returns:
         The five part title after self.norm.normalize_for_scoring()
     """
     return self.norm.normalize_for_scoring(place.get_five_part_title())
Ejemplo n.º 7
0
    def __init__(self,
                 directory_name: str,
                 display_progress,
                 show_message: bool,
                 exit_on_error: bool,
                 languages_list_dct,
                 feature_code_list_dct,
                 supported_countries_dct,
                 volume=''):
        """
            Set up logging and lookup state, and create the GeodataBuild instance

        #Args:
            directory_name: directory where geoname.org files are.  DB will be in 'cache' folder under this
            display_progress: None or function to display progress(percent_done:int, msg:str)
            show_message: If True - show TKInter message dialog on error
            exit_on_error: If True - exit on significant error
            languages_list_dct: Dictionary of ISO-2 languages to import from AlternateNamesV2.txt
            feature_code_list_dct: Dictionary of Geoname Feature codes to import into DB
            supported_countries_dct: Dictionary of ISO-2 Country codes to import into DB
            volume: passed through unchanged to GeodataBuild
        """
        self.logger = logging.getLogger(__name__)
        self.display_progress = display_progress
        self.save_place: Loc = Loc.Loc()
        self.miss_diag_file = None
        # Value to determine if two lat/longs are similar based on Rectilinear Distance
        self.distance_cutoff = 0.6

        build_options = {
            'display_progress': display_progress,
            'show_message': show_message,
            'exit_on_error': exit_on_error,
            'languages_list_dct': languages_list_dct,
            'feature_code_list_dct': feature_code_list_dct,
            'supported_countries_dct': supported_countries_dct,
            'volume': volume,
        }
        self.geo_build = GeodataBuild.GeodataBuild(str(directory_name),
                                                   **build_options)
Ejemplo n.º 8
0
    def load_handler(self):
        """
        The User pressed the LOAD button to load and review an Ancestry file. Switch app display to the Review layout
        Load in file name and call process_place_entries()

        If the configured path is missing or is neither a GEDCOM (.ged) nor a Gramps (.gramps) file, a warning
        dialog is shown and the method returns without opening any files.
        """
        self.w.original_entry.text = ""
        self.w.remove_initialization_widgets()  # Remove old widgets
        self.w.create_review_widgets()  # Switch display from Initial widgets to review widgets
        self.load_data_files()
        ged_path = self.cfg.get("gedcom_path")  # Get saved config setting for  file

        # Load appropriate ancestry file handler based on file type (Gramps XML or GEDCOM)
        # NOTE(review): temp_suffix is not defined in this method - presumably a module level name; confirm
        if ged_path is not None and '.ged' in ged_path:
            # GEDCOM
            self.out_suffix = "import.ged"
            self.ancestry_file_handler = Gedcom.Gedcom(
                in_path=ged_path,
                out_suffix=temp_suffix,
                cache_d=self.cache_dir,
                progress=None)  # Routines to open and parse GEDCOM file
        elif ged_path is not None and '.gramps' in ged_path:
            # GRAMPS
            self.out_suffix = "import.gramps"
            self.ancestry_file_handler = GrampsXml.GrampsXml(
                in_path=ged_path,
                out_suffix=temp_suffix,
                cache_d=self.cache_dir,
                progress=None,
                geodata=self.geodata
            )  # Routines to open and parse Gramps file
        else:
            # No path configured, or the path is not a supported type.  There is no handler to run,
            # and building the diagnostic file names from a missing path would fail, so stop here.
            self.out_suffix = 'unk.new.ged'
            messagebox.showwarning(
                'Unknown file type',
                f'UNKNOWN File type. Not .gramps and not .ged. \n\n{ged_path}')
            return

        # Diagnostic files stay open for the duration of the review
        self.out_diag_file = open(ged_path + '.output.txt', 'wt')
        self.in_diag_file = open(ged_path + '.input.txt', 'wt')
        miss_diag_fname = ged_path + '.miss.txt'
        self.geodata.open_diag_file(miss_diag_fname)

        if self.ancestry_file_handler.error:
            TKHelper.fatal_error(f"File {ged_path} not found.")

        self.w.root.update()
        self.place: Loc.Loc = Loc.Loc()  # Create an object to store info for the current Place

        # Add  filename to Title
        path_parts = os.path.split(ged_path)  # Extract filename from full path
        self.w.title.text = f'GEO FINDER v{__version__.__version__} - {path_parts[1]}'

        # Read  file, find each place entry and handle it.
        self.w.user_entry.text = "Scanning to previous position..."
        self.process_place_entries()
Ejemplo n.º 9
0
    def find_best_match(self, location: str, place: Loc) -> bool:
        """
            Find the best scoring match for this location in the geoname dictionary.
        #Args:
            location:  location name, e.g. Los Angeles, California, USA
            place:  Loc instance
        #Returns: True if a match was found
            place is updated in place; on success place.prefix is cleaned up against the long name
        """
        # Parse the location and look it up in the place db
        res = self.find_matches(location, place)

        # Narrow the candidates down to the best scoring entries
        flags = self.filter_results(place)

        if len(place.georow_list) > 0:
            # Keep only the first (best) match and copy it into place
            place.georow_list = place.georow_list[:1]
            self.process_results(place=place, flags=flags)
            place.set_place_type()
        elif res not in GeoUtil.successful_match:
            # No rows and the lookup did not report success
            return False

        # Success path (shared by both cases above): tidy up the prefix
        nm = f'{place.get_long_name(self.geo_build.output_replace_dct)}'
        place.prefix = place.prefix_cleanup(place.prefix, nm)
        return True
Ejemplo n.º 10
0
    def run_test_inscore(idx) -> int:
        """
            Prepare test case idx and calculate the weighted input score, i.e. the score for the
            percent of input target text that matched the result.  The outcome is printed.
        #Args:
            idx: index of the test case passed to TestScoring.prepare_test

        #Returns:
            Score returned by TestScoring.scoring._calculate_weighted_score
        """
        target = Loc.Loc()
        result = Loc.Loc()

        log = TestScoring.logger
        log.debug('TEST INPUT SCORE:')

        TestScoring.prepare_test(idx, target, result)
        log.debug(f'prepare_test: INP={target.city},{target.admin2_name},'
                  f'{target.admin1_name},{target.country_name}'
                  f' RES={result.city},{result.admin2_name},'
                  f'{result.admin1_name},{result.country_name}')

        # Create full, normalized titles (prefix,city,county,state,country); only the tokens are needed here
        _result_title, result_tokens, _target_title, target_tokens = MatchScore._prepare_input(target, result)

        in_score = TestScoring.scoring._calculate_weighted_score(target_tokens, result_tokens)

        print(f'#{idx} SCORE={in_score:.1f} In={in_score:.1f}[{target.original_entry.title().lower()}] [{result.get_five_part_title()}]')
        return in_score
Ejemplo n.º 11
0
    def _lookup_city_as_admin2(self, place: Loc, result_list) -> int:
        """
        Lookup place.city as admin2 name
        #Args:
            place: Loc instance.  city is moved into admin2_name, city is blanked and place_type set to ADMIN2
            result_list: list that the rows found by this lookup are appended to

        #Returns:
            Best score returned by the lookup

        """
        # Treat the city token as an ADMIN2 name
        place.admin2_name, place.city = place.city, ''
        place.place_type = Loc.PlaceType.ADMIN2
        self.logger.debug(
            f'  Try admin2  [{place.admin2_name}] as city [{place.get_five_part_title()}]'
        )

        score = self.geo_build.geodb.lookup_place(place=place)
        result_list.extend(place.georow_list)
        return score
Ejemplo n.º 12
0
 def __init__(self, geodb):
     """
         Set up search state and helper objects
     #Args:
         geodb: database object stored as self.geodb
     """
     self.logger = logging.getLogger(__name__)
     self.geodb = geodb

     # Search settings and counters
     self.detailed_debug = True
     self.use_wildcards = True
     self.start = 0
     self.total_lookups = 0
     self.cache = {}
     self.place_type = ''

     # Columns retrieved for each row
     self.select_str = 'name, country, admin1_id, admin2_id, lat, lon, feature, geoid, sdx'

     # Helper objects
     self.match = MatchScore.MatchScore()
     self.norm = Normalize.Normalize()
     self.place = Loc.Loc()
Ejemplo n.º 13
0
    def is_country_valid(self, place: Loc) -> bool:
        """
        See if COUNTRY is present and is in the supported country list

        #Args:
            place: Loc instance.  On failure result_type is set to NO_COUNTRY or NOT_SUPPORTED
                   (NOT_SUPPORTED also sets place_type to COUNTRY)

        #Returns:
            True if country is valid
        """
        iso = place.country_iso
        if iso == '':
            # No country at all
            place.result_type = GeoUtil.Result.NO_COUNTRY
            return False

        if iso in self.geo_build.supported_countries_dct:
            return True

        # Country given, but not in the supported list
        self.logger.debug(f'Country [{iso}] not supported')
        place.result_type = GeoUtil.Result.NOT_SUPPORTED
        place.place_type = Loc.PlaceType.COUNTRY
        return False
Ejemplo n.º 14
0
    def lookup_place(self, place: Loc) -> float:
        """
            **Lookup a place in geoname.org db**
            Lookup is based on place.place_type as follows:
                Loc.PlaceType.COUNTRY: no search is run.  If place.georow_list already holds rows,
                    place.country_name is replaced by self.get_country_name() and the score is VERY_GOOD
                Otherwise: place.georow_list is cleared and refilled by self._search() using place.city,
                    the admin ids, country iso, feature and the soundex of place.city.
                    For Loc.PlaceType.ADMIN1 place.feature is set to "ADM1" first
        # Args:
            place: Loc instance.  Call Loc.parse_place() before calling lookup_place()

        # Returns:
            Best score found (MatchScore.Score.VERY_POOR for a COUNTRY with an empty georow_list)
            place.georow_list contains a list of matching entries.
            place.result_type is reset to Result.STRONG_MATCH

        """
        place.result_type = Result.STRONG_MATCH
        best_score = MatchScore.Score.VERY_POOR

        if place.place_type == Loc.PlaceType.COUNTRY:
            # Country
            if place.georow_list:
                place.country_name = self.get_country_name(place.country_name)
                best_score = MatchScore.Score.VERY_GOOD
        else:
            # General search
            if place.place_type == Loc.PlaceType.ADMIN1:
                place.feature = "ADM1"
            place.georow_list.clear()
            best_score = self._search(place=place,
                                      georow_list=place.georow_list,
                                      name=place.city,
                                      admin1_id=place.admin1_id,
                                      admin2_id=place.admin2_id,
                                      iso=place.country_iso,
                                      feature=place.feature,
                                      sdx=get_soundex(place.city))
        self.logger.debug(f'**LOOKUP PLACE  score={best_score}')
        return best_score
Ejemplo n.º 15
0
    def add_alias_to_db(self, ky: str, geo_build: GeodataBuild):
        """
            Look up the main entry for alias ky and insert a copy of its geo row under the alias name
        #Args:
            ky: alias name, used as key into alias_list and as the name of the inserted row
            geo_build: GeodataBuild instance used for the lookup and the insert

        #Returns:
            None.  Nothing is inserted if the lookup finds no (or an empty) row
        """
        alias_row = alias_list.get(ky)

        # Describe the main entry we are looking for
        place = Loc.Loc()
        place.country_iso = alias_row[ALIAS_ISO].lower()
        place.city = alias_row[ALIAS_NAME]
        place.feature = alias_row[ALIAS_FEAT]
        place.place_type = Loc.PlaceType.CITY

        # Lookup main entry and get GEOID
        geo_build.geodb.s.lookup_place(place)
        if len(place.georow_list) == 0 or len(place.georow_list[0]) == 0:
            return

        # Copy the columns up to and including SDX, rename the copy to the alias and insert it
        geo_row = list(place.georow_list[0][:GeoUtil.Entry.SDX + 1])
        geo_build.update_geo_row_name(geo_row=geo_row, name=ky)
        geo_build.insert(geo_tuple=tuple(geo_row),
                         feat_code=alias_row[ALIAS_FEAT])
Ejemplo n.º 16
0
 def __init__(self, directory: str, filename: str, progress_bar, prefix,
              geo_build: GeodataBuild, lang_list):
     """
         Set up the reader for the geonames alternate names file
     # Args:
         directory: base directory for alternate names file
         filename: filename of geonames alternate_namesV2.txt file
         progress_bar: tkhelper progress bar or None
         prefix: passed through to the parent class initializer
         geo_build: GeodataBuild instance
         lang_list: List of ISO languages we want to support, e.g. ['fr', 'es']
     """
     super().__init__(directory, filename, progress_bar, prefix=prefix)

     # Collaborators and settings supplied by the caller
     self.geo_build: GeodataBuild.GeodataBuild = geo_build
     self.lang_list = lang_list

     # Derived / working state
     self.sub_dir = GeoUtil.get_cache_directory(directory)
     self.place = Loc.Loc()
     self.search = None
Ejemplo n.º 17
0
    def create_enclosures(self):
        """
        Walk through all entries and create any missing enclosure items

        There are separate dictionaries for each tier (prefix, city, county, country).
        The tiers are visited in reverse index order, i.e. the last dictionary first.
        """
        self.logger.debug('\n\n******** DONE \n  CREATE CSV ENCLOSURES *********')
        place = Loc.Loc()

        tiers = list(enumerate(self.hierarchy_dictionaries))
        tiers.reverse()

        for tier_idx, tier in tiers:
            self.logger.debug(f'===TABLE {tier_idx}===')
            for key in tier:
                # Load the entry for this key into place, then create its enclosure
                _retrieve_csv_place(self.hierarchy_dictionaries, self.geodata, place, key, tier_idx)
                self.logger.debug(f'** CSV {key} {place.original_entry}')
                self._create_enclosed_by(place)
Ejemplo n.º 18
0
    def load_handler(self):
        """
        Load in global replace list and display it

        Each dictionary key is the original name and each value is parsed into a prefix and a GEOID.
        The GEOID is looked up and the resulting long name (with prefix, if any) is appended to the
        display list.  Entries with a blank GEOID are logged and skipped.
        """
        self.clear_display_list(self.tree)
        place = Loc.Loc()
        self.edit_entry.text = "Loading Replacement Dictionary"

        for key in sorted(self.dict):
            prefix, geoid = ReplacementDictionary.parse_replacement_entry(self.dict[key])
            if geoid == '':
                self.logger.warning(f'blank item=[{key}] ')
                continue

            # Lookup GEOID to get location info
            place.target = geoid
            self.geodb.get_geoid(place=place)

            if len(place.georow_list) > 0:
                # Found it.  Copy geo row to Place
                self.geodb.copy_georow_to_place(row=place.georow_list[0], place=place)
            else:
                # Not found.  Check the target before clear() is called on place
                target_is_blank = len(place.target) == 0
                place.clear()
                place.city1 = '<DELETE>' if target_is_blank else f'Database error for GeoID {geoid}'
                place.place_type = Loc.PlaceType.CITY

            # Get prefix if there was one
            place.prefix = prefix

            place.set_place_type()
            nm = place.get_long_name(self.output_replace_dct)
            line = f'[{place.prefix}]{place.prefix_commas}{nm}' if len(place.prefix) > 0 else f'{nm}'

            self.list_append(self.tree, key, line)

        self.edit_entry.text = ""
Ejemplo n.º 19
0
    def lookup_place(self, location_name):
        """
        Find the best match for location_name and print the outcome

        #Args:
            location_name: name of the location to look up

        #Returns: None.  One message is printed: the best match, country not supported, or no match
        """
        # Location instance holds the search parameters and the result
        place: Loc.Loc = Loc.Loc()

        found = self.geodata.find_best_match(location=location_name,
                                             place=place)

        if found:
            # Create full name for result
            nm = f'{place.get_display_name(None)}'
            message = f'   Best match for {location_name}:\n {nm}  Prefix=[{place.prefix}{place.prefix_commas}] Score= {place.score:.1f}\n'
        elif place.result_type == Geodata.GeoUtil.Result.NOT_SUPPORTED:
            message = f'   NO match for {location_name}:\n Country NOT SUPPORTED: {place.country_name} \n'
        else:
            message = f'   NO match for {location_name}:\n'

        print(message)
Ejemplo n.º 20
0
    def find_matches(self, location: str, place: Loc):
        """
            Find a location in the geoname database.  On successful match, place.georow_list will contain
            a list of georows that matched the name.  Each georow can be copied to a Loc structure by
            calling process_results

            Steps:
            1) Parse the location and validate the country
            2) Lookup as parsed; if the best score is poor, follow up with a deep lookup
            3) If an Admin2 token is present, retry using it as the city
            4) Score, process and filter the combined results

        #Args:
            location: comma separated name of location to find, e.g. 'Los Angeles, California, USA'
            place: Loc structure.  Parsed fields, georow_list and result_type are updated in place
        #Returns:
            GeoUtil.Result code (the value of place.result_type)
        """
        place.parse_place(place_name=location, geo_db=self.geo_build.geodb)

        self.is_country_valid(place)
        if place.result_type == GeoUtil.Result.NOT_SUPPORTED:
            place.georow_list.clear()
            # Return the result code as documented (a score sentinel was returned here previously)
            return place.result_type

        # Create full entry text
        place.update_names(self.geo_build.output_replace_dct)

        flags = ResultFlags(limited=False, filtered=False)
        # We will do different search types and append all results into result_list
        result_list = []

        # Save a shallow copy of place so we can restore fields
        self.save_place = copy.copy(place)

        # After parsing, last token is either country or underscore.
        # Second to last is either Admin1 or underscore
        # If >2 tokens:  token[0] is placed in City and in Prefix
        # If >3 tokens:  token[1] is placed in Admin2 and appended to Prefix

        # 1) Try lookup based on standard parsing: lookup city, county, state/province, or country as parsed
        self.logger.debug(
            f'  1) Standard, based on parsing.  pref [{place.prefix}] city [{place.city}]'
            f' sdx={GeoSearch.get_soundex(place.city)} '
            f'feat={place.feature} typ=[{place.place_type}]')

        if place.place_type in (Loc.PlaceType.COUNTRY, Loc.PlaceType.ADMIN1):
            # Country and Admin1 places are not searched here; the current result code is returned as-is.
            # NOTE(review): the original condition tested ADMIN1 twice - possibly ADMIN2 was intended
            # for the second test (the log messages mention adm2).  Behavior is left unchanged; confirm.
            self.logger.debug('not country, adm1, adm2')
            return place.result_type

        self.logger.debug('find std place  - not ADM*  ')
        best_score = self.geo_build.geodb.s.lookup_place(place=place)
        self.logger.debug(f'std: best={best_score}')

        if place.georow_list:
            result_list.extend(place.georow_list)

        if best_score >= MatchScore.Score.POOR_CUTOFF:
            # No good matches found.  Try a deep search on soundex of combinations of terms
            self.logger.debug('--- DEEP SEARCH city ---')
            best_score = self.geo_build.geodb.s.deep_lookup(place=place)
            if place.georow_list:
                result_list.extend(place.georow_list)

        # Restore fields
        self._restore_fields(place, self.save_place)

        # 2) Try second token (Admin2) as a city
        if place.admin2_name != '':
            self.logger.debug(f'try 2nd token as city')
            place.georow_list.clear()
            best_score = self._find_type_as_city(place,
                                                 Loc.PlaceType.ADMIN2)
            self.logger.debug(f'2nd token best={best_score}')

            if place.georow_list:
                result_list.extend(place.georow_list)

                # See if we found any good scoring matches
                if best_score >= MatchScore.Score.POOR_CUTOFF:
                    # No good matches found.  Try a deep search on soundex of combinations of terms
                    self.logger.debug('--- DEEP SEARCH city ---')
                    best_score = self.geo_build.geodb.s.deep_lookup(
                        place=place)
                    if place.georow_list:
                        result_list.extend(place.georow_list)

            self._restore_fields(place, self.save_place)

        #  Move result_list into place georow list
        place.georow_list.clear()
        place.georow_list.extend(result_list)

        if len(place.georow_list) > 0:
            # Score every candidate row (the returned best score is not needed here)
            self.geo_build.geodb._assign_scores(place.georow_list,
                                                place,
                                                '',
                                                fast=False,
                                                quiet=True)

            self.process_results(place=place, flags=flags)
            flags = self.filter_results(place)

        if len(place.georow_list) == 0:
            # NO MATCH
            if place.result_type != GeoUtil.Result.NO_COUNTRY and place.result_type != GeoUtil.Result.NOT_SUPPORTED:
                place.result_type = GeoUtil.Result.NO_MATCH
                self.logger.debug(f'Not found.')
            else:
                self.logger.debug('Found country')
        elif len(place.georow_list) > 1:
            self.logger.debug(f'Success!  {len(place.georow_list)} matches')
            place.result_type = GeoUtil.Result.MULTIPLE_MATCHES

        # Process the results
        self.process_results(place=place, flags=flags)
        return place.result_type
Ejemplo n.º 21
0
    def filter_results(self, place: Loc):
        """
            Sort place.georow_list by match score and eliminate duplicates

        Duplicates are removed by self.remove_duplicates().  The remaining rows are sorted by score
        (then Admin1) and only rows whose score is within a gap threshold of the best (lowest) score
        are kept.  If exactly one row remains, the best score is VERY_GOOD or better, and the result
        is not NOT_SUPPORTED, then place.result_type is set to STRONG_MATCH.  Otherwise a line is
        written to the miss diagnostic file if one is open.

        The limited flag is set if there were more than 100 rows on entry.
        The filtered flag is always False in this implementation.
        #Args:
            place: Loc instance.  georow_list and result_type are updated in place

        #Returns:
            ResultFlags(limited=limited_flag, filtered=date_filtered)
        """
        entry = GeoUtil.Entry
        date_filtered = False  # Flag to indicate whether we dropped locations due to event date
        limited_flag = len(place.georow_list) > 100
        result_flags = ResultFlags(limited=limited_flag, filtered=date_filtered)

        if len(place.georow_list) == 0:
            self.logger.debug('EMPTY')
            return result_flags

        # Remove duplicate locations in list (have same name and lat/lon)
        self.remove_duplicates(place)
        if len(place.georow_list) == 0:
            self.logger.error(f'georow_list = 0')
            return result_flags

        # Sort places in match_score order
        ranked = sorted(place.georow_list,
                        key=itemgetter(entry.SCORE, entry.ADM1))
        if len(ranked) == 0:
            self.logger.error(f'new_list = 0')
            return result_flags

        if len(ranked[0]) < entry.SCORE + 1:
            # Row is too short to carry a score
            self.logger.debug(f'len = {len(ranked[0])}')
            self.logger.debug(f'[{ranked[0]}]')
            return result_flags

        min_score = ranked[0][entry.SCORE]
        place.georow_list.clear()

        # The cutoff depends only on the best score, so it is computed once
        base = MatchScore.Score.VERY_GOOD + (MatchScore.Score.GOOD / 3)
        gap_threshold = base + abs(min_score) * .6

        # Go through sorted list and only add items to georow_list that are close to the best score
        for geo_row in ranked:
            score = geo_row[entry.SCORE]

            if score > min_score + gap_threshold:
                self.logger.debug(
                    f'SKIP Score={score:.1f} Min={min_score:.1f} Gap={gap_threshold:.1f} [{geo_row[entry.PREFIX]}]'
                    f' {geo_row[entry.NAME]},'
                    f' {geo_row[entry.ADM2]},'
                    f' {geo_row[entry.ADM1]} ')
                continue

            place.georow_list.append(geo_row)
            self.logger.debug(
                f'Score {score:.1f} [{geo_row[entry.PREFIX]}] {geo_row[entry.NAME]}, '
                f'AD2={geo_row[entry.ADM2]},'
                f' AD1={geo_row[entry.ADM1]} {geo_row[entry.ISO]}'
            )

        single_strong = (min_score <= MatchScore.Score.VERY_GOOD
                         and len(place.georow_list) == 1
                         and place.result_type != GeoUtil.Result.NOT_SUPPORTED)
        if single_strong:
            place.result_type = GeoUtil.Result.STRONG_MATCH
        elif self.miss_diag_file:
            # Log item that we couldnt match (score is that of the last row examined)
            self.miss_diag_file.write(
                f'Lookup {place.original_entry} thresh={gap_threshold} gap={score - min_score}\n\n'
            )

        return result_flags
Ejemplo n.º 22
0
    def match_score(self,
                    target_place: Loc,
                    result_place: Loc,
                    fast=False) -> float:
        """
            Calculate a heuristic score for how well a result place name matches a target place name.  The score is based on
            percent of characters that didn't match in input and output (plus other items described below).
            Mismatch score is 0-100% reflecting the percent mismatch between the user input and the result.  This is then
            adjusted by Feature type (large city gives best score) plus other items to give a final heuristic where
            -10 is perfect match of a large city and 100 is no match.

            A) Heuristic:
            1) Create 5 part title (prefix, city, county, state/province, country)
            2) Normalize text - self.norm.normalize_for_scoring()
            3) Remove sequences of 2 chars or more that match in target and result
            4) Calculate inscore - percent of characters in input that didn't match result.  Weight by term (city, county, state, ctry)
                    Exact match of city term gets a bonus
            5) Calculate result score - percent of characters in db result that didn't match input

            B) Score components (All are weighted in final score):
            in_score - (0-100) - score for input that didn't match output
            feature_score - (0-100)  More important features get lower score.
            City with 1M population is zero.  Valley is 100.  Geodata.feature_priority().
            wildcard_penalty - score is raised by X if it includes a wildcard
            prefix_penalty -  score is raised by length of Prefix

            C) A standard text difference, such as Levenshtein, was not used because those treat both strings as equal,
            whereas this treats the User text as more important than DB result text and also weights each token.  A user's
            text might commonly be something like: Paris, France and a DB result of Paris, Paris, Ile De France, France.
            The Levenshtein distance would be large, but with this heuristic, the middle terms can have lower weights, and
            having all the input matched can be weighted higher than mismatches on the county and province.  This heuristic gives
            a score of -9 for Paris, France.

            Side effects: self.score_diags and self.timing are reset.  target_place.prefix is rewritten while
            scoring and is always put back to the caller's value before returning or raising.  For an
            ADVANCED_SEARCH target, target_place.updated_entry is overwritten with its long name.

        # Args:
            target_place:  Loc  with users entry.
            result_place:  Loc with DB result.
            fast: If True, skip the heuristic below and return self.fast_score(target_place, result_place)
        # Returns:
            score
        """
        if fast:
            return self.fast_score(target_place, result_place)

        self.score_diags = ''  # Diagnostic text for scoring
        self.timing = 0

        # The prefix is modified below for scoring only - remember the caller's value
        save_prefix = target_place.prefix

        try:
            # Remove items in prefix that are in result
            if target_place.place_type != Loc.PlaceType.ADVANCED_SEARCH:
                target_place.prefix = self.norm.normalize_for_scoring(
                    target_place.prefix)
                result_name = result_place.get_long_name(None)
                target_place.prefix = Loc.Loc.fast_prefix(target_place.prefix,
                                                          result_name)
            else:
                target_place.updated_entry = target_place.get_long_name(None)

            # Create full, normalized titles (prefix,city,county,state,country)
            result_title, result_tokens, target_title, target_tokens = self._prepare_input(
                target_place, result_place)

            # Calculate Prefix score.  Prefix is not used in search and longer is generally worse
            prefix_score = _calculate_prefix_penalty(target_place.prefix)

            # Calculate score for  percent of input target text that matched result
            in_score = self._calculate_weighted_score(target_tokens,
                                                      result_tokens)

            # Calculate score for wildcard search - wildcard searches are missing letters and need special handling
            wildcard_score = self._calculate_wildcard_score(
                target_place.original_entry)

            # Calculate Feature score - this ensures "important" places get higher rank (large city, etc)
            feature_score = Geodata.Geodata._feature_priority(
                result_place.feature)

            # Weight and add up scores - Each item is 0-100 and then weighted, except wildcard penalty
            score: float = in_score * self.input_weight + feature_score * self.feature_weight + \
                           prefix_score * self.prefix_weight + wildcard_score

            self.logger.debug(
                f'SCORE {score:.1f} res=[{result_title}] pref=[{target_place.prefix}]'
                f'inSc={in_score * self.input_weight:.1f}% feat={feature_score * self.feature_weight:.1f} {result_place.feature}  '
                f'wild={wildcard_score} pref={prefix_score * self.prefix_weight:.1f}'
            )

            self.logger.debug(self.score_diags)
        finally:
            # Put the caller's prefix back even when one of the scoring steps raised,
            # so a failed lookup never leaves target_place with a scoring-only prefix
            target_place.prefix = save_prefix

        # NOTE(review): fixed offset of 8 added to every full score - rationale is not visible here
        return score + 8
Ejemplo n.º 23
0
    def __init__(self,
                 directory: str,
                 display_progress,
                 show_message: bool,
                 exit_on_error: bool,
                 languages_list_dct: {},
                 feature_code_list_dct: {},
                 supported_countries_dct: {},
                 volume=''):
        """
        Set up the builder that reads the datafiles needed for geodata, filters them and creates a sql db.
        The constructor itself stores the settings, checks that the cache directory exists, loads the
        output text replacement dictionary and creates the AlternateNames helper.

        Filter dictionary examples:
            languages_list_dct={'fr','de'}
            feature_code_list_dct={'PPL', 'ADM1', 'CSTL'}
            supported_countries_dct = {'us','gb','at'}
        # Args:
            directory: base directory
            display_progress: None or Handler called with percent_done:int, msg:str
            show_message: True to show message boxes to user on errors
            exit_on_error:  True to exit on serious errors
            languages_list_dct: dictionary containing the ISO-2 languages  to load from alternateNames
                (iterated here to build self.lang_list)
            feature_code_list_dct: dictionary containing the Geonames.org feature codes to load
            supported_countries_dct: dictionary containing the ISO-2 countries to load
            volume: disk volume to use - e.g. C: for Windows or /Volumes/xyz for OSX, /media/xyz for linux.
                If not empty, the process changes its working directory to it.
        # Raises:
            SystemExit: (status 1) if the cache directory does not exist and exit_on_error is True
        """
        self.logger = logging.getLogger(__name__)
        self.geodb: [GeoDB.GeoDB, None] = None
        self.show_message = show_message
        self.geoid_main_dict = {}  # Key is GEOID, Value is DB ID for entry
        self.geoid_admin_dict = {}  # Key is GEOID, Value is DB ID for entry
        # TODO fix volume handling
        self.volume = volume
        self.collate = 'COLLATE NOCASE'

        self.exit_on_error = exit_on_error
        self.required_db_version = 4
        # Message to user upgrading from earlier DB version
        self.db_upgrade_text = 'Renamed column to Feature'
        self.directory: str = directory
        self.progress_bar = display_progress
        self.line_num = 0
        self.cache_changed: bool = False
        # Resolved before the optional chdir below, as in the original statement order
        sub_dir = GeoUtil.get_cache_directory(self.directory)
        self.country = None
        self.languages_list_dct = languages_list_dct
        self.feature_code_list_dct = feature_code_list_dct
        self.supported_countries_dct = supported_countries_dct
        self.norm = Normalize.Normalize()

        # Plain list of the language codes, handed to AlternateNames below
        self.lang_list = list(self.languages_list_dct)

        if volume != '':
            os.chdir(volume)
        if not os.path.exists(sub_dir):
            self.logger.warning(f'Directory] {sub_dir} NOT FOUND')
            if self.show_message:
                messagebox.showwarning(
                    'Folder not found',
                    f'Directory\n\n {sub_dir}\n\n NOT FOUND')
            if exit_on_error:
                # Non-zero status: a bare sys.exit() would report success to the parent process
                sys.exit(1)

        # Read in Text Replacement dictionary pickle - this has output text replacements
        self.output_replace_cd = CachedDictionary.CachedDictionary(
            sub_dir, "output_list.pkl")
        self.output_replace_cd.read()
        self.output_replace_dct: Dict[str, str] = self.output_replace_cd.dict
        # Keys of the replacement dictionary as a list
        self.output_replace_list = list(self.output_replace_dct)

        self.entry_place = Loc.Loc()

        # Support for Geonames AlternateNames file.  Adds alternate names for entries
        self.alternate_names = AlternateNames.AlternateNames(
            directory=self.directory,
            geo_build=self,
            progress_bar=self.progress_bar,
            prefix="Step 3 of 4) ",
            filename='alternateNamesV2.txt',
            lang_list=self.lang_list)
Ejemplo n.º 24
0
    def copy_georow_to_place(self, row, place: Loc, fast: bool):
        """
        Copy data from DB row into place instance
        Country, admin1_id, admin2_id, city, lat/lon, feature, geoid are updated if available.
        prefix and place_type are also set, and score is copied when the row has a score entry.
        #Args:
            row: georow from geoname database
            place: Loc instance
            fast: Currently ignored
        #Returns:
            None.  Place instance is updated with data from georow
        """
        # Start from a clean slate so values from a previously copied row cannot leak through
        place.admin1_id = ''
        place.admin2_id = ''
        place.admin1_name = ''
        place.admin2_name = ''
        place.city = ''

        place.country_iso = str(row[Entry.ISO])
        place.lat = row[Entry.LAT]
        place.lon = row[Entry.LON]
        place.feature = str(row[Entry.FEAT])
        place.geoid = str(row[Entry.ID])
        place.prefix = row[Entry.PREFIX]
        place.place_type = Loc.PlaceType.CITY  # Default; overridden below for ADM0/ADM1/ADM2 features

        # Only copy the fields that apply to the row's administrative level
        if place.feature == 'ADM0':
            place.place_type = Loc.PlaceType.COUNTRY
        elif place.feature == 'ADM1':
            place.admin1_id = row[Entry.ADM1]
            place.place_type = Loc.PlaceType.ADMIN1
        elif place.feature == 'ADM2':
            place.admin1_id = row[Entry.ADM1]
            place.admin2_id = row[Entry.ADM2]
            place.place_type = Loc.PlaceType.ADMIN2
        else:
            place.admin1_id = row[Entry.ADM1]
            place.admin2_id = row[Entry.ADM2]
            place.city = row[Entry.NAME]

        # NOTE(review): presumably fills in the admin names from the ids - confirm in update_names
        self.s.update_names(place)

        # Replace missing values with empty strings
        if place.admin2_name is None:
            place.admin2_name = ''
        if place.admin1_name is None:
            place.admin1_name = ''

        # Test for None BEFORE converting: str(None) is the text 'None', which would
        # otherwise end up as the city name
        place.city = '' if place.city is None else str(place.city)

        try:
            place.score = row[Entry.SCORE]
        except IndexError:
            # Row has no score entry - leave place.score untouched
            pass
Ejemplo n.º 25
0
 def setUp(self) -> None:
     """Assign a newly constructed ``Loc.Loc`` to ``self.place``."""
     new_place: Loc.Loc = Loc.Loc()
     self.place = new_place
Ejemplo n.º 26
0
    def _assign_scores(self,
                       georow_list,
                       place,
                       target_feature,
                       fast=False,
                       quiet=False) -> float:
        """
        Assign match score to each result in list.  Rows in georow_list are replaced in place.

        Args:
            georow_list: list of georow tuples.  Each scored row is replaced by a new tuple
                with its SCORE and PREFIX entries set.
            place: Loc with the entry to match against.  If None, no scoring is done and each
                row shorter than Entry.SCORE + 1 just gets a dummy value of 1 appended.
            target_feature: feature to favour - a result with this feature gets 10 subtracted from its score
            fast: passed through to copy_georow_to_place and match_score
            quiet: currently has no effect (the switch to INFO level logging is disabled)

        Returns:
            Lowest score that was assigned.  999.9 if place is None or no row was scored.
        """
        result_place: Loc = Loc.Loc()
        start = time.time()
        best_score = 999.9

        # Identity test: 'place == None' would go through any __eq__ defined on the place class
        if place is None:
            # Add dummy match quality score  to each entry
            for idx, rw in enumerate(georow_list):
                update = list(rw)
                if len(update) < Entry.SCORE + 1:
                    update.append(1)
                    georow_list[idx] = tuple(update)
            return best_score

        original_prefix = place.prefix

        # The quiet handling (lowering the log level to INFO) is disabled, so the level
        # captured here is simply written back after the loop
        lev = logging.getLogger().getEffectiveLevel()

        # Add match quality score and prefix to each entry
        for idx, rw in enumerate(georow_list):
            # Every row is scored against the caller's original prefix
            place.prefix = original_prefix
            if len(rw) == 0:
                continue

            self.copy_georow_to_place(row=rw, place=result_place, fast=fast)
            result_place.original_entry = result_place.get_long_name(None)

            # Lower score is better, so a matching feature is rewarded by subtracting a bonus
            bonus = 10.0 if result_place.feature == target_feature else 0

            if len(place.prefix) > 0 and result_place.prefix == '':
                result_place.prefix = ' '
            else:
                result_place.prefix = ''

            score = self.match.match_score(target_place=place,
                                           result_place=result_place,
                                           fast=fast) - bonus
            best_score = min(best_score, score)

            # Convert row tuple to list and extend so we can assign score
            update = list(rw)
            if len(update) < Entry.SCORE + 1:
                update.append(1)
            update[Entry.SCORE] = score

            result_place.prefix = self.norm.normalize(place.prefix, True)
            update[Entry.PREFIX] = result_place.prefix
            georow_list[idx] = tuple(update)  # Convert back from list to tuple

        if best_score < MatchScore.Score.STRONG_CUTOFF:
            place.result_type = Result.STRONG_MATCH

        # Restore logging level
        logging.getLogger().setLevel(lev)

        elapsed = time.time() - start
        self.logger.debug(
            f'assign_scores min={best_score} elapsed={elapsed:.3f}')
        return best_score
Ejemplo n.º 27
0
 def setUp(self) -> None:
     """Assign new ``Loc.Loc`` instances to ``TestScoring.in_place`` and ``TestScoring.out_place``."""
     # Both are set on the TestScoring class itself rather than on self
     TestScoring.in_place: Loc.Loc = Loc.Loc()
     TestScoring.out_place: Loc.Loc = Loc.Loc()