Ejemplo n.º 1
0
    def find_best_match(self, location: str, place: Loc) -> bool:
        """
            Look up a location and reduce the result list to the single best entry.
        #Args:
            location:  location name, e.g. Los Angeles, California, USA
            place:  Loc instance; it is modified in place
        #Returns:
            True if a match was found, otherwise False.
            On success place.georow_list holds at most one row and place.prefix
            has been passed through place.prefix_cleanup().
        """
        # find_matches() does the lookup and reports a result code;
        # filter_results() then trims place.georow_list and returns flags.
        match_result = self.find_matches(location, place)
        result_flags = self.filter_results(place)

        if len(place.georow_list) > 0:
            # Keep only the first row, then let process_results() handle it
            place.georow_list = place.georow_list[:1]
            self.process_results(place=place, flags=result_flags)
            place.set_place_type()
        elif match_result not in GeoUtil.successful_match:
            # No rows left and the lookup did not report success
            return False

        # Shared by both success paths: tidy the prefix against the long name
        long_name = f'{place.get_long_name(self.geo_build.output_replace_dct)}'
        place.prefix = place.prefix_cleanup(place.prefix, long_name)
        return True
Ejemplo n.º 2
0
    def copy_georow_to_place(self, row, place: Loc, fast: bool):
        """
        Copy data from DB row into place instance
        Country, admin1_id, admin2_id, city, lat/lon, feature, geoid, prefix and
        place_type are updated; score is updated if the row carries one.
        #Args:
            row: georow from geoname database, indexed by the Entry constants
            place: Loc instance
            fast: Currently ignored
        #Returns:
            None.  Place instance is updated with data from georow
        """
        # Reset the fields that are only filled in for some feature types
        place.admin1_id = ''
        place.admin2_id = ''
        place.admin1_name = ''
        place.admin2_name = ''
        place.city = ''

        place.country_iso = str(row[Entry.ISO])
        place.lat = row[Entry.LAT]
        place.lon = row[Entry.LON]
        place.feature = str(row[Entry.FEAT])
        place.geoid = str(row[Entry.ID])
        place.prefix = row[Entry.PREFIX]
        # Default to CITY; overridden below for country/admin features
        place.place_type = Loc.PlaceType.CITY

        if place.feature == 'ADM0':
            place.place_type = Loc.PlaceType.COUNTRY
        elif place.feature == 'ADM1':
            place.admin1_id = row[Entry.ADM1]
            place.place_type = Loc.PlaceType.ADMIN1
        elif place.feature == 'ADM2':
            place.admin1_id = row[Entry.ADM1]
            place.admin2_id = row[Entry.ADM2]
            place.place_type = Loc.PlaceType.ADMIN2
        else:
            place.admin1_id = row[Entry.ADM1]
            place.admin2_id = row[Entry.ADM2]
            place.city = row[Entry.NAME]

        # NOTE(review): update_names() presumably fills admin1_name/admin2_name
        # from the ids set above and may leave None behind - confirm.
        self.s.update_names(place)

        if place.admin2_name is None:
            place.admin2_name = ''
        if place.admin1_name is None:
            place.admin1_name = ''

        # Test for None BEFORE converting: str(None) is the text 'None', which
        # previously made the None check unreachable and leaked 'None' as a city.
        place.city = '' if place.city is None else str(place.city)

        try:
            place.score = row[Entry.SCORE]
        except IndexError:
            # Deliberate best effort: rows without a score column keep the
            # score already stored on place.
            pass
Ejemplo n.º 3
0
    def match_score(self,
                    target_place: Loc,
                    result_place: Loc,
                    fast=False) -> float:
        """
            Calculate a heuristic score for how well a result place name matches a target place name.  The score is based on
            percent of characters that didn't match in input and output (plus other items described below).
            Mismatch score is 0-100% reflecting the percent mismatch between the user input and the result.  This is then
            adjusted by Feature type (large city gives best score) plus other items to give a final heuristic where
            -10 is perfect match of a large city and 100 is no match.

            A) Heuristic:
            1) Create 5 part title (prefix, city, county, state/province, country)
            2) Normalize text - self.norm.normalize_for_scoring()
            3) Remove sequences of 2 chars or more that match in target and result
            4) Calculate inscore - percent of characters in input that didn't match result.  Weight by term (city, county, state, ctry)
                    Exact match of city term gets a bonus
            5) Calculate result score - percent of characters in db result that didn't match input

            B) Score components (All are weighted in final score):
            in_score - (0-100) - score for input that didn't match output
            feature_score - (0-100)  More important features get lower score.
            City with 1M population is zero.  Valley is 100.  Geodata.feature_priority().
            wildcard_penalty - score is raised by X if it includes a wildcard
            prefix_penalty -  score is raised by length of Prefix

            C) A standard text difference, such as Levenshtein, was not used because those treat both strings as equal,
            whereas this treats the User text as more important than DB result text and also weights each token.  A user's
            text might commonly be something like: Paris, France and a DB result of Paris, Paris, Ile De France, France.
            The Levenshtein distance would be large, but with this heuristic, the middle terms can have lower weights, and
            having all the input matched can be weighted higher than mismatches on the county and province.  This heuristic gives
            a score of -9 for Paris, France.

        # Args:
            target_place:  Loc  with users entry.  Its prefix is rewritten while scoring and
                           restored before returning, even if scoring raises.
            result_place:  Loc with DB result.
            fast:  if True, skip this heuristic and return self.fast_score() instead.
        # Returns:
            score
        """
        if fast:
            return self.fast_score(target_place, result_place)

        self.score_diags = ''  # Diagnostic text for scoring
        self.timing = 0
        # The prefix is overwritten below for scoring only; remember the caller's value
        save_prefix = target_place.prefix

        try:
            # Remove items in prefix that are in result
            if target_place.place_type != Loc.PlaceType.ADVANCED_SEARCH:
                target_place.prefix = self.norm.normalize_for_scoring(
                    target_place.prefix)
                result_name = result_place.get_long_name(None)
                target_place.prefix = Loc.Loc.fast_prefix(target_place.prefix,
                                                          result_name)
            else:
                target_place.updated_entry = target_place.get_long_name(None)

            # Create full, normalized titles (prefix,city,county,state,country)
            result_title, result_tokens, target_title, target_tokens = self._prepare_input(
                target_place, result_place)

            # Calculate Prefix score.  Prefix is not used in search and longer is generally worse
            prefix_score = _calculate_prefix_penalty(target_place.prefix)

            # Calculate score for  percent of input target text that matched result
            in_score = self._calculate_weighted_score(target_tokens, result_tokens)

            # Calculate score for wildcard search - wildcard searches are missing letters and need special handling
            wildcard_score = self._calculate_wildcard_score(
                target_place.original_entry)

            # Calculate Feature score - this ensures "important" places get higher rank (large city, etc)
            feature_score = Geodata.Geodata._feature_priority(result_place.feature)

            # Weight and add up scores - Each item is 0-100 and then weighted, except wildcard penalty
            score: float = in_score * self.input_weight + feature_score * self.feature_weight + \
                           prefix_score * self.prefix_weight + wildcard_score

            self.logger.debug(
                f'SCORE {score:.1f} res=[{result_title}] pref=[{target_place.prefix}]'
                f'inSc={in_score * self.input_weight:.1f}% feat={feature_score * self.feature_weight:.1f} {result_place.feature}  '
                f'wild={wildcard_score} pref={prefix_score * self.prefix_weight:.1f}'
            )

            self.logger.debug(self.score_diags)
        finally:
            # Always hand the caller's Loc back with its original prefix, so a
            # failure in any scoring helper cannot leave it modified.
            target_place.prefix = save_prefix

        # NOTE(review): constant offset of 8 on the final score; the reason is
        # not visible in this block - confirm against the documented score range.
        return score + 8