Ejemplo n.º 1
0
    def find_geoid(self, geoid: str, place: Loc) -> None:
        """
        Look up a single entry by its geoid and load the result into place

        place.geoid is set and place.georow_list is emptied, then lookup_geoid()
        is called.  If that leaves the list empty, the same lookup is repeated
        with admin=True.  When a row is found the result is marked STRONG_MATCH
        and applied through process_results(); otherwise it is marked NO_MATCH.

        #Args:
            geoid:  Geonames.org geoid
            place:  Loc instance.  Its fields are updated in place

        #Returns:
            None.  Location fields in place are updated
        """
        flags = ResultFlags(limited=False, filtered=False)
        place.geoid = geoid
        place.georow_list.clear()

        searcher = self.geo_build.geodb.s

        # First attempt: lookup without the admin flag
        searcher.lookup_geoid(georow_list=place.georow_list,
                              geoid=place.geoid,
                              place=place)

        # Second attempt, only when the first one found nothing
        if not place.georow_list:
            searcher.lookup_geoid(georow_list=place.georow_list,
                                  geoid=place.geoid,
                                  place=place,
                                  admin=True)

        if not place.georow_list:
            place.result_type = GeoUtil.Result.NO_MATCH
            return

        place.result_type = GeoUtil.Result.STRONG_MATCH
        self.process_results(place=place, flags=flags)
Ejemplo n.º 2
0
    def process_results(self, place: Loc, flags) -> None:
        """
        Apply the first entry of place.georow_list to the place record

        * A NOT_SUPPORTED result forces place.place_type to COUNTRY.
        * With at least one row and a result type listed in
          GeoUtil.successful_match, the first row is copied into place.
        * With at least one row and any other result type except
          NOT_SUPPORTED, the result type is changed to PARTIAL_MATCH.
        * place.set_place_type_text() is always called last.

        #Args:
            place: Loc instance
            flags: Flags tuple as returned by sort_results (not read by this method)

        #Returns:
            None.  place instance fields are updated
        """
        not_supported = GeoUtil.Result.NOT_SUPPORTED

        if place.result_type == not_supported:
            place.place_type = Loc.PlaceType.COUNTRY

        if place.georow_list:
            if place.result_type in GeoUtil.successful_match:
                # Successful match: take the field values from the top row
                self.geo_build.geodb.copy_georow_to_place(
                    row=place.georow_list[0], place=place, fast=False)
            elif place.result_type != not_supported:
                # Rows exist but the match was not a success: downgrade to partial
                place.result_type = GeoUtil.Result.PARTIAL_MATCH

        place.set_place_type_text()
Ejemplo n.º 3
0
    def is_country_valid(self, place: Loc) -> bool:
        """
        Check that place has a country code and that the country is supported

        When the check fails, place.result_type is set to say why:
        NO_COUNTRY for an empty place.country_iso, NOT_SUPPORTED when the code
        is not in self.geo_build.supported_countries_dct (in that case
        place.place_type is also set to COUNTRY).

        #Args:
            place: Loc instance.  country_iso is read; result_type and place_type may be updated

        #Returns:
            True if country is valid
        """
        # No country code at all
        if place.country_iso == '':
            place.result_type = GeoUtil.Result.NO_COUNTRY
            return False

        if place.country_iso in self.geo_build.supported_countries_dct:
            return True

        # Country code present but not in the supported list
        self.logger.debug(f'Country [{place.country_iso}] not supported')
        place.result_type = GeoUtil.Result.NOT_SUPPORTED
        place.place_type = Loc.PlaceType.COUNTRY
        return False
Ejemplo n.º 4
0
    def lookup_place(self, place: Loc) -> float:
        """
            **Lookup a place in geoname.org db**

            place.result_type is first set to STRONG_MATCH.  The lookup then depends on place.place_type:
                Loc.PlaceType.COUNTRY: no search is run.  If place.georow_list already has entries,
                    place.country_name is replaced by self.get_country_name(place.country_name)
                    and the score is MatchScore.Score.VERY_GOOD.  Otherwise the score stays at
                    MatchScore.Score.VERY_POOR.
                Otherwise: place.georow_list is cleared and self._search() is run on the city name,
                    admin1/admin2 ids, country iso, feature and the soundex of the city.
                    For Loc.PlaceType.ADMIN1 the feature is forced to "ADM1" first.

        # Args:
            place: Loc instance.  Call Loc.parse_place() before calling lookup_place()

        # Returns:
            Best score found.
            place.georow_list contains a list of matching entries.
            Each entry has: Lat, Long, districtID (County or State or Province ID), and a match quality score
        """
        # NOTE(review): the return annotation was the list literal `[]`; the method returns the
        # score.  `float` assumes _search() returns a number like the Score constants - confirm.
        place.result_type = Result.STRONG_MATCH
        best_score = MatchScore.Score.VERY_POOR

        if place.place_type == Loc.PlaceType.COUNTRY:
            # Country: rely on rows that are already present in georow_list
            if place.georow_list:
                place.country_name = self.get_country_name(place.country_name)
                best_score = MatchScore.Score.VERY_GOOD
        else:
            # General search
            if place.place_type == Loc.PlaceType.ADMIN1:
                place.feature = "ADM1"
            place.georow_list.clear()
            best_score = self._search(place=place,
                                      georow_list=place.georow_list,
                                      name=place.city,
                                      admin1_id=place.admin1_id,
                                      admin2_id=place.admin2_id,
                                      iso=place.country_iso,
                                      feature=place.feature,
                                      sdx=get_soundex(place.city))
        self.logger.debug(f'**LOOKUP PLACE  score={best_score}')
        return best_score
Ejemplo n.º 5
0
    def find_matches(self, location: str, place: Loc):
        """
            Find a location in the geoname database.  On successful match, place.georow_list will contain
            a list of georows that matched the name.  Each georow can be copied to a Loc structure by
            calling process_result

            Steps:
              1) location is parsed into place and the country is validated.  An unsupported
                 country ends the search immediately with an empty georow_list.
              2) Places typed as COUNTRY or ADMIN1 are not searched here; the current
                 result type is returned as is.
              3) Otherwise a standard lookup is run (plus a deep lookup when the best score is at
                 or above MatchScore.Score.POOR_CUTOFF), then the Admin2 token is tried as a city.
                 Rows from all passes are pooled, scored, filtered and applied to place.

        #Args:
            location: comma separated name of location to find, e.g. 'Los Angeles, California, USA'
            place: Loc structure.  Updated in place

        #Returns:
            GeoUtil.Result code (place.result_type)
        """
        place.parse_place(place_name=location, geo_db=self.geo_build.geodb)

        self.is_country_valid(place)
        if place.result_type == GeoUtil.Result.NOT_SUPPORTED:
            place.georow_list.clear()
            # Return the result code, like every other exit path of this method
            # (this path used to return the placeholder score 9999)
            return place.result_type

        # Create full entry text
        place.update_names(self.geo_build.output_replace_dct)

        flags = ResultFlags(limited=False, filtered=False)
        # We will do different search types and append all results into result_list
        result_list = []

        # Save a shallow copy of place so we can restore fields
        self.save_place = copy.copy(place)

        # After parsing, last token is either country or underscore.
        # Second to last is either Admin1 or underscore
        # If >2 tokens:  token[0] is placed in City and in Prefix
        # If >3 tokens:  token[1] is placed in Admin2 and appended to Prefix

        # 1) Try lookup based on standard parsing: lookup city, county, state/province, or country as parsed
        self.logger.debug(
            f'  1) Standard, based on parsing.  pref [{place.prefix}] city [{place.city}]'
            f' sdx={GeoSearch.get_soundex(place.city)} '
            f'feat={place.feature} typ=[{place.place_type}]')

        # NOTE(review): the original condition compared against ADMIN1 twice.  The log texts
        # ('not ADM*', 'not country, adm1, adm2') suggest ADMIN2 may have been intended for the
        # second test, but that would stop ADMIN2 places from being searched - confirm before changing.
        if place.place_type == Loc.PlaceType.COUNTRY or place.place_type == Loc.PlaceType.ADMIN1:
            self.logger.debug('not country, adm1, adm2')
            return place.result_type

        self.logger.debug('find std place  - not ADM*  ')
        best_score = self.geo_build.geodb.s.lookup_place(place=place)
        self.logger.debug(f'std: best={best_score}')

        if place.georow_list:
            result_list.extend(place.georow_list)

        if best_score >= MatchScore.Score.POOR_CUTOFF:
            # No good matches found.  Try a deep search on soundex of combinations of terms
            self.logger.debug('--- DEEP SEARCH city ---')
            self.geo_build.geodb.s.deep_lookup(place=place)
            if place.georow_list:
                result_list.extend(place.georow_list)

        # Restore fields
        self._restore_fields(place, self.save_place)

        # 2) Try second token (Admin2) as a city
        if place.admin2_name != '':
            self.logger.debug('try 2nd token as city')
            place.georow_list.clear()
            best_score = self._find_type_as_city(place, Loc.PlaceType.ADMIN2)
            self.logger.debug(f'2nd token best={best_score}')

            if place.georow_list:
                result_list.extend(place.georow_list)

                # See if we found any good scoring matches
                if best_score >= MatchScore.Score.POOR_CUTOFF:
                    # No good matches found.  Try a deep search on soundex of combinations of terms
                    self.logger.debug('--- DEEP SEARCH city ---')
                    self.geo_build.geodb.s.deep_lookup(place=place)
                    if place.georow_list:
                        result_list.extend(place.georow_list)

            self._restore_fields(place, self.save_place)

        #  Move result_list into place georow list
        place.georow_list.clear()
        place.georow_list.extend(result_list)

        if len(place.georow_list) > 0:
            # Score the pooled rows, apply the first row, then filter the list
            self.geo_build.geodb._assign_scores(place.georow_list,
                                                place,
                                                '',
                                                fast=False,
                                                quiet=True)

            self.process_results(place=place, flags=flags)
            flags = self.filter_results(place)

        if len(place.georow_list) == 0:
            # NO MATCH
            if place.result_type != GeoUtil.Result.NO_COUNTRY and place.result_type != GeoUtil.Result.NOT_SUPPORTED:
                place.result_type = GeoUtil.Result.NO_MATCH
                self.logger.debug('Not found.')
            else:
                self.logger.debug('Found country')
        elif len(place.georow_list) > 1:
            self.logger.debug(f'Success!  {len(place.georow_list)} matches')
            place.result_type = GeoUtil.Result.MULTIPLE_MATCHES

        # Process the results
        self.process_results(place=place, flags=flags)
        return place.result_type
Ejemplo n.º 6
0
    def filter_results(self, place: Loc):
        """
            Sort place.georow_list by match score, remove duplicates and drop weak matches

        Duplicates are removed by self.remove_duplicates(place).
        The remaining rows are sorted by (score, admin1) and only rows whose score is within
        gap_threshold of the best (lowest) score are kept in place.georow_list.
        If exactly one row remains, the best score is at or below MatchScore.Score.VERY_GOOD and
        the result is not NOT_SUPPORTED, place.result_type is set to STRONG_MATCH.  Otherwise a
        line is written to self.miss_diag_file when that file is set.

        No date based filtering is done here, so the 'filtered' flag is always False.
        The 'limited' flag is set when more than 100 rows were passed in.

        #Args:
            place: Loc instance.  place.georow_list is filtered in place

        #Returns:
            ResultFlags(limited=limited_flag, filtered=date_filtered)
        """
        date_filtered = False  # Flag to indicate whether we dropped locations due to event date
        limited_flag = len(place.georow_list) > 100
        flags = ResultFlags(limited=limited_flag, filtered=date_filtered)

        if len(place.georow_list) == 0:
            self.logger.debug('EMPTY')
            return flags

        # Remove duplicate locations in list (have same name and lat/lon)
        self.remove_duplicates(place)
        if len(place.georow_list) == 0:
            self.logger.error('georow_list = 0')
            return flags

        # Rows without a score column cannot be sorted.  Check BEFORE sorting: the sort key
        # indexes the score column, so a short row would raise IndexError inside sorted()
        # and a check placed after the sort could never trigger.
        min_row_len = GeoUtil.Entry.SCORE + 1
        short_row = next(
            (row for row in place.georow_list if len(row) < min_row_len), None)
        if short_row is not None:
            self.logger.debug(f'len = {len(short_row)}')
            self.logger.debug(f'[{short_row}]')
            return flags

        # Sort places in match_score order (lowest score first)
        new_list = sorted(place.georow_list,
                          key=itemgetter(GeoUtil.Entry.SCORE,
                                         GeoUtil.Entry.ADM1))

        min_score = new_list[0][GeoUtil.Entry.SCORE]
        place.georow_list.clear()

        # Allowed distance from the best score.  Depends only on min_score, so compute it once
        base = MatchScore.Score.VERY_GOOD + (MatchScore.Score.GOOD / 3)
        gap_threshold = base + abs(min_score) * .6

        # Go through sorted list and only add items to georow_list that are close to the best score
        score = min_score
        for geo_row in new_list:
            score = geo_row[GeoUtil.Entry.SCORE]

            if score > min_score + gap_threshold:
                self.logger.debug(
                    f'SKIP Score={score:.1f} Min={min_score:.1f} Gap={gap_threshold:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}]'
                    f' {geo_row[GeoUtil.Entry.NAME]},'
                    f' {geo_row[GeoUtil.Entry.ADM2]},'
                    f' {geo_row[GeoUtil.Entry.ADM1]} ')
            else:
                place.georow_list.append(geo_row)
                self.logger.debug(
                    f'Score {score:.1f} [{geo_row[GeoUtil.Entry.PREFIX]}] {geo_row[GeoUtil.Entry.NAME]}, '
                    f'AD2={geo_row[GeoUtil.Entry.ADM2]},'
                    f' AD1={geo_row[GeoUtil.Entry.ADM1]} {geo_row[GeoUtil.Entry.ISO]}'
                )

        if min_score <= MatchScore.Score.VERY_GOOD and len(
                place.georow_list
        ) == 1 and place.result_type != GeoUtil.Result.NOT_SUPPORTED:
            place.result_type = GeoUtil.Result.STRONG_MATCH
        elif self.miss_diag_file:
            # Log item that we couldnt match.  'gap' is measured from the last (worst) row
            self.miss_diag_file.write(
                f'Lookup {place.original_entry} thresh={gap_threshold} gap={score - min_score}\n\n'
            )

        return flags