Ejemplo n.º 1
0
def _calculate_prefix_penalty(prefix):
    """
    Return the match penalty contributed by a location's prefix.

    A location that carries a prefix is treated as a weaker match, so the
    penalty grows with the prefix length.
    #Args:
        prefix: The prefix text; only its length and GeoUtil.is_street(prefix) are used
    #Returns:
        0 for an empty prefix, otherwise 5 + len(prefix), scaled by 0.1 when
        GeoUtil.is_street(prefix) is true (the result is a float in that case)
    """
    prefix_len = len(prefix)
    if prefix_len <= 0:
        # No prefix, so nothing to penalize
        return 0

    base_penalty = 5 + prefix_len
    # A street-like prefix is far less damaging to the match, so it keeps only
    # a tenth of the penalty
    return base_penalty * 0.1 if GeoUtil.is_street(prefix) else base_penalty
Ejemplo n.º 2
0
    def parse_place(self, place_name: str, geo_db: GeoDB.GeoDB):
        """
            Split a comma separated place name into prefix, city, admin2, admin1 and country.
            Tokens are read right to left: the last token is tried as a country and the
            one before it as admin1.  A position that fails its lookup is filled with the
            placeholder '_' so the remaining tokens keep their meaning.
            A name containing '--' is handed to get_filter_parameters() instead and nothing
            else is parsed.
        #Args:
            place_name: The place name to parse
            geo_db: Database object; all lookups go through geo_db.s
        #Returns:
            None.  Results are stored on self: city, admin2_name, admin1_name, admin1_id,
            country_name, country_iso, prefix, place_type, and (when there is at least
            one token) result_type.
        """
        self.geo_db = geo_db
        self.logger.debug(f'PARSE {place_name}\n')
        self.clear()
        self.original_entry = place_name

        # '[' and '(' count as separators.  The closing characters are left in place here
        # (the original note says normalize() strips them).
        name = re.sub(r'\(', ',', re.sub(r'\[', ',', place_name))

        tokens = name.split(",")
        if not tokens[-1]:
            # A trailing separator leaves an empty last token - drop it
            tokens.pop()

        self.place_type = PlaceType.CITY

        # Overall plan:
        #   1. Validate Country and State/Province (Admin1) from the last two tokens
        #   2. With one more token, treat it as city
        #   3. With two more tokens, treat them as city, admin2
        #   4. The first two tokens are also copied to prefix
        # Place type ends up as the leftmost item found - City, Admin2, Admin1, or Country

        if '--' in place_name:
            # Advanced Search - the name carries filter flags, so extract those instead
            self.logger.debug('filter')
            self.get_filter_parameters(place_name)
            return

        if tokens:
            # COUNTRY - the right-most token should be the country
            self.country_name = self.norm.normalize(tokens[-1], False)
            self.country_iso = geo_db.s.get_country_iso(self.country_name)
            self.logger.debug(
                f'1) Lookup COUNTRY [{self.country_name}] Found ISO [{self.country_iso}] *******'
            )

            if self.country_iso == '':
                # Last token is not a country.  Add a placeholder in the country slot so
                # the token that was last now sits in the admin1 position.
                tokens.append('_')
                self.result_type = GeoUtil.Result.NO_COUNTRY
                self.country_name = ''
            else:
                self.place_type = PlaceType.COUNTRY
                self.result_type = GeoUtil.Result.PARTIAL_MATCH

        if len(tokens) > 1:
            # ADMIN1 - the token just left of the country slot
            val = tokens[-2]
            self.logger.debug(f'Get ADM1 from tkn-2 [{val}]')
            self.admin1_name = self.norm.admin1_normalize(val, self.country_iso)

            if len(self.admin1_name) == 0:
                # Nothing left after normalizing - mark the admin1 slot as a placeholder
                tokens[-2] = '_'
            else:
                self.logger.debug(f'2) Find ADMIN1 [{self.admin1_name}] *******')
                self.admin1_id = geo_db.s.get_admin1_id(self.admin1_name, self.country_iso)

                if self.admin1_id == '':
                    # Not an Admin1.  Keep the token (it may be a city or admin2) and put a
                    # placeholder into the admin1 slot just before the country slot.
                    self.admin1_name = ''
                    tokens.insert(-1, '_')
                else:
                    # Found Admin1 - replace our text with the database's name for it
                    self.place_type = PlaceType.ADMIN1
                    self.georow_list = []

                    self.admin1_name = geo_db.s.get_admin1_name(
                        self.admin1_id, self.country_iso)
                    self.logger.debug(
                        f'2) Find iso for admin1 id [{self.admin1_id}] *******'
                    )

                    # The admin1 id also gives us the country ISO
                    self.country_iso = geo_db.s.get_iso_from_admin1_id(
                        self.admin1_id, self.country_iso)
                    self.result_type = GeoUtil.Result.PARTIAL_MATCH

                    if self.country_name == '':
                        self.country_name = geo_db.s.get_country_name(self.country_iso)

        # The last two tokens are now Admin1, Country (either may be the placeholder '_').
        # Drop empty tokens before looking at what is left of them.
        tokens = [tkn for tkn in tokens if tkn]
        token_count = len(tokens)

        if token_count >= 3:
            # Possible formats: City, Admin1, Country  or  Admin2, Admin1, Country
            leading = tokens[0]
            self.city = self.norm.normalize(leading, False)
            self.place_type = PlaceType.CITY

            # The first token doubles as the prefix unless it contains '*'
            if '*' not in leading:
                self.prefix = str(leading.strip(' '))

        if token_count >= 4:
            # If Admin2 is not found later, it will be looked up as a city
            if GeoUtil.is_street(tokens[-4].lower()):
                # Format: Prefix, City, Admin1, Country
                self.city = self.norm.normalize(tokens[-3], False)
            else:
                # Format: City, Admin2, Admin1, Country
                self.admin2_name = self.norm.normalize(tokens[-3], False)
                self.city = self.norm.normalize(tokens[-4], False)

            self.place_type = PlaceType.CITY

            # The prefix becomes the first two tokens unless the second contains '*'
            if '*' not in tokens[1]:
                first_part = str(tokens[0].strip(' '))
                second_part = str(tokens[1].strip(' '))
                self.prefix = first_part + ' ' + second_part

        self.prefix = self.norm.normalize(self.prefix, False)

        # An Admin1 match can supply the ISO without a country name - fill the name in
        if self.country_name == '' and self.country_iso != '':
            self.country_name = geo_db.s.get_country_name(self.country_iso)

        self.logger.debug(
            f"    ======= PARSED: {place_name} \nCity [{self.city}] Adm2 [{self.admin2_name}]"
            f" Adm1 [{self.admin1_name}] adm1_id [{self.admin1_id}] Cntry [{self.country_name}] Pref=[{self.prefix}]"
            f" type_id={self.place_type}\n")