Beispiel #1
0
 def get_soundex_by_word(text: str) -> str:
     """
     Build a blank separated string of soundex codes, one per word of text.

     text is split on single blanks; every piece is passed through
     Loc.sort_words and the outcome is handed to GeoSearch.get_soundex.

     #Args:
         text: words separated by blanks

     #Returns: the per-word codes joined with a single blank.  Empty codes seen
         before the first non-empty code are dropped, later ones are kept.
     """
     codes = []
     for token in text.split(' '):
         code = GeoSearch.get_soundex(Loc.sort_words(token))
         # Nothing is collected until the first non-empty code shows up
         if codes or code:
             codes.append(code)
     return ' '.join(codes)
Beispiel #2
0
    def prefix_cleanup(pref: str, result: str) -> str:
        """
        Strip from the prefix the words that also show up in the match result.

        Both inputs are lower-cased and split into comma separated segments.
        For every (result segment, prefix word) pair, remove_item is applied to
        the prefix and to the working copy of the result segment when either
          - the prefix word, or its soundex code, is found in the result segment, or
          - a result word is contained in the prefix word and is longer than
            60% of the prefix word.

        #Args:
            pref: prefix text
            result: match result text

        #Returns: lower-cased prefix after the removals, with commas deleted and
            surrounding blanks stripped
        """
        cleaned = pref.lower().strip(' ')
        # The segments iterated below come from the prefix as it was on entry;
        # removals applied to `cleaned` do not change what is iterated.
        pref_segments = cleaned.split(',')

        for segment in result.lower().split(','):
            segment_sdx = ' ' + Loc.get_soundex_by_word(segment) + ' '
            # Working copy of the result segment, padded with a blank on each side
            padded = ' ' + segment + ' '

            for pref_segment in pref_segments:
                for word in pref_segment.split(' '):
                    # Soundex is taken from the word before any padding is added
                    word_sdx = ' ' + GeoSearch.get_soundex(word) + ' '
                    # Words shorter than 3 characters get a trailing blank
                    # (presumably to avoid matching inside longer words - confirm)
                    if len(word) < 3:
                        word = word + ' '

                    text_match = word != '' and word in padded
                    sdx_match = word_sdx != '' and word_sdx in segment_sdx
                    if text_match or sdx_match:
                        bare_word = word.strip(' ')
                        padded = remove_item(bare_word, padded)
                        cleaned = remove_item(bare_word, cleaned)

                    if len(word) == 0:
                        continue

                    # Reverse check: result words contained in the prefix word.
                    # The word list is a snapshot taken before this pass removes anything.
                    for res_word in padded.split(' '):
                        if len(res_word) < 3:
                            res_word = res_word + ' '
                        if res_word not in word:
                            continue
                        if len(res_word) / len(word) > 0.6:
                            padded = remove_item(res_word, padded)
                            cleaned = remove_item(res_word, cleaned)

        return re.sub('[,]', '', cleaned).strip(' ').strip(',')
Beispiel #3
0
    def insert_alternate_name(self, alternate_name: str, geoid: str,
                              lang: str):
        """
        Insert one row into the altname table using INSERT OR IGNORE.

        The sdx column is the soundex of the normalized alternate name; the
        name column receives alternate_name unchanged.

        #Args:
            alternate_name: alternate name to add for this geoid
            geoid: geonames.org geoid
            lang: ISO lang code for this entry

        #Returns: None
        """
        normalized = self.norm.normalize(alternate_name, True)
        # Value order must follow the column list of the statement: name, lang, geoid, sdx
        values = (alternate_name, lang, geoid,
                  GeoSearch.get_soundex(normalized))
        statement = ''' INSERT OR IGNORE INTO altname(name,lang, geoid, sdx)
                  VALUES(?,?,?,?) '''
        self.geodb.db.execute(statement, values)
Beispiel #4
0
    def update_geo_row_name(self, geo_row: [], name: str, normalize=True):
        """
        Store a new location name in geo_row and refresh its soundex entry.

        geo_row is modified in place at positions GeoSearch.Entry.NAME and
        GeoSearch.Entry.SDX.

        #Args:
            geo_row: mutable row to update
            name: location name
            normalize: if True the name is run through self.norm.normalize with
                remove_commas=True; otherwise it is only lower-cased
        """
        entry = GeoSearch.Entry
        stored_name = (self.norm.normalize(name, remove_commas=True)
                       if normalize else name.lower())
        geo_row[entry.NAME] = stored_name
        # Soundex is computed from the name exactly as it was stored
        geo_row[entry.SDX] = GeoSearch.get_soundex(stored_name)
Beispiel #5
0
    def __init__(self, db_path, show_message: bool, exit_on_error: bool,
                 set_speed_pragmas: bool, db_limit: int):
        """
            Geoname database init: open the database and, if the file was
            already there, run a sanity query against it.

            When the sanity query reports an error and show_message is True,
            the user is asked whether the file should be deleted; on "yes" the
            connection is closed and the file removed.  Either way the
            constructor then exits or raises.
        # Args:
            db_path: full path to database file
            show_message: If True, show messagebox to user on error
            exit_on_error: If True, call sys.exit() when the sanity test fails
            set_speed_pragmas: If True, call set_speed_pragmas() on the DB
            db_limit: SQL LIMIT parameter
        # Raises:
            ValueError('Cannot open database'): DB object reported an error
            ValueError('Database empty or corrupt'): sanity test failed and
                exit_on_error is False
        """
        self.logger = logging.getLogger(__name__)
        self.db_path = db_path
        self.show_message = show_message
        self.exit_on_error = exit_on_error

        # Lookup statistics and limits
        self.max_query_results = 50
        self.total_time = 0
        self.total_lookups = 0
        self.slow_lookup = 0

        self.match = MatchScore.MatchScore()
        self.norm = Normalize.Normalize()

        # Must be recorded before the DB object is created
        # (presumably DB.DB creates a missing file - confirm)
        db_existed = os.path.exists(db_path)

        self.db = DB.DB(db_filename=db_path,
                        show_message=show_message,
                        exit_on_error=exit_on_error)
        if self.db.err != '':
            self.logger.error(f"Error! cannot open database {db_path}.")
            raise ValueError('Cannot open database')

        # Sanity test is only run on a database that was found on disk
        if db_existed and self.db.test_database(
                'name',
                'main.geodata',
                where='name = ? AND country = ?',
                args=('ba', 'fr')):
            self.logger.warning(f'DB error for {db_path}')

            if show_message and messagebox.askyesno(
                    'Error',
                    f'Geoname database is empty or corrupt:\n\n {db_path} \n\nDo you want to delete it and rebuild?'
            ):
                messagebox.showinfo('', 'Deleting Geoname database')
                self.db.conn.close()
                os.remove(db_path)

            if exit_on_error:
                sys.exit()
            raise ValueError('Database empty or corrupt')

        if set_speed_pragmas:
            self.db.set_speed_pragmas()

        self.db_limit = db_limit
        self.db.order_string = ''
        self.db.limit_string = f'LIMIT {self.db_limit}'
        self.place_type = ''
        self.s: GeoSearch.GeoSearch = GeoSearch.GeoSearch(geodb=self)
Beispiel #6
0
    def _weighted_score(self, target_tokens: [], result_tokens: []) -> float:
        """
        Weighted difference between target tokens and result tokens; smaller is better.

        Tokens are compared position by position.  Position 0 is never scored
        here and positions beyond the shorter of the two lists are ignored.
        For a non-empty target token the value is
            0.6 * (100 - fuzz.ratio of the texts) + 0.4 * (100 - fuzz.ratio of the soundex codes)
        reduced by 6 when below 10, and by a further 3 when the first five
        characters of both tokens are equal.  An empty target token yields a
        fixed penalty instead.  Each value is multiplied by the weight for its
        position and the total is divided by the sum of the weights used.

        Side effects: the time spent in the text fuzz.ratio call is added to
        self.timing, and one diagnostic fragment per scored position is
        appended to self.score_diags.

        #Args:
            target_tokens: tokens of the target
            result_tokens: tokens of the candidate result

        #Returns: weighted average of the per-position values, 0.0 if no position was scored
        """
        weights = copy.copy(self.token_weight)
        weighted_sum = 0.0
        weight_total = 0.0

        scored_end = min(len(target_tokens), len(result_tokens))
        for idx in range(1, scored_end):
            targ = target_tokens[idx]
            res = result_tokens[idx]

            if len(targ) > 0:
                # Only the text comparison is included in the timing figure
                started = time.time()
                text_diff = 100.0 - fuzz.ratio(targ, res)
                self.timing += (time.time() - started)

                sdx_diff = 100.0 - fuzz.ratio(GeoSearch.get_soundex(targ),
                                              GeoSearch.get_soundex(res))
                value = text_diff * 0.6 + sdx_diff * 0.4

                # Reward a close match
                if value < 10:
                    value -= 6
                # Reward identical leading characters
                if targ[0:5] == res[0:5]:
                    value -= 3
            else:
                # Target has nothing at this position.  Missing country costs 15,
                # missing state/province costs 5; any other position costs 5
                # only when the result does have an entry there.
                other_penalty = 5.0 if len(res) > 0 else 0
                if idx == COUNTRY_IDX:
                    value = 15.0
                elif idx == ADMIN1_IDX:
                    value = 5.0
                else:
                    value = other_penalty

            weighted_sum += value * weights[idx]
            weight_total += weights[idx]
            self.score_diags += f'  {idx}) {value:.1f} [{result_tokens[idx]}]'

        # Normalise by the total weight actually used
        if weight_total > 0:
            return weighted_sum / weight_total
        return 0.0
Beispiel #7
0
    def handle_line(self, line_num, row):
        """
        Handle one line of the alternate names file.

        The line is split on tabs into five tokens.  A blank language is treated
        as 'en'.  If the language is in self.lang_list and the geoid is known to
        the main or admin geoid dictionary of self.geo_build, the existing entry
        is looked up and, unless the alternate name (lower-cased) equals the
        name already on that entry:
          - a copy of the entry carrying the alternate name is inserted into the
            main DB, unless its feature contains ADM1 or ADM2
          - the name is added through geo_build.insert_alternate_name
        self.count is incremented once per insert.

        NOTE(review): the previous documentation said the main DB insert also
        depends on the language not being English; the code has no such check.

        :param line_num:  file line number (used in the debug message only)
        :param row: line in file to be handled
        :return: None
        """
        if self.search is None:
            # Create search instance on first use
            self.search = GeoSearch.GeoSearch(self.geo_build.geodb)

        alt_tokens = row.split('\t', maxsplit=4)
        if len(alt_tokens) != 5:
            self.logger.debug(
                f'Incorrect number of tokens {len(alt_tokens)}: {alt_tokens} line {line_num}'
            )
            return

        self.place.georow_list = []
        if alt_tokens[ALT_LANG] == '':
            alt_tokens[ALT_LANG] = 'en'

        # Only add if lang is in requested lang list
        if alt_tokens[ALT_LANG] not in self.lang_list:
            return

        # Only add this alias if the DB has an entry with the same GEOID
        # (the geoname DB is filtered).  Main DB is checked first, then admin DB.
        geoid = alt_tokens[ALT_GEOID]
        dbid = self.geo_build.geoid_main_dict.get(geoid)
        in_admin = dbid is None
        if in_admin:
            dbid = self.geo_build.geoid_admin_dict.get(geoid)
        if dbid is not None:
            # lookup_dbid fills self.place.georow_list
            self.search.lookup_dbid(self.place.georow_list,
                                    dbid,
                                    place=self.place,
                                    admin=in_admin)

        if not self.place.georow_list:
            return

        # Work on a list copy of the first row, truncated after the soundex column
        update = list(self.place.georow_list[0][0:GeoSearch.Entry.SDX + 1])

        # Nothing to do if the alternate name matches the existing name
        if update[GeoSearch.Entry.NAME] == alt_tokens[ALT_NAME].lower():
            return

        # Replace name and soundex in the copy with the alternate name
        self.geo_build.update_geo_row_name(geo_row=update,
                                           name=alt_tokens[ALT_NAME])

        feature = update[GeoSearch.Entry.FEAT]
        if 'ADM1' not in feature and 'ADM2' not in feature:
            # Add to main DB if not ADM1/ADM2
            self.geo_build.insert(geo_tuple=tuple(update), feat_code=feature)
            self.count += 1

        # Add name to altnames table
        self.geo_build.insert_alternate_name(alt_tokens[ALT_NAME],
                                             alt_tokens[ALT_GEOID],
                                             alt_tokens[ALT_LANG])
        self.count += 1