Exemple #1
0
    def __init__(self, components, country_rtree, debug=False):
        """Keep references to the collaborators and build an address formatter.

        Args:
            components: Stored unchanged as ``self.components``.
            country_rtree: Stored unchanged as ``self.country_rtree``.
            debug: Stored unchanged as ``self.debug``; defaults to ``False``.
        """
        self.components, self.country_rtree = components, country_rtree
        self.debug = debug

        # A fresh formatter is created for every instance.
        self.formatter = AddressFormatter()
Exemple #2
0
    def __init__(self, geoplanet_db):
        """Load GeoPlanet places and aliases from a SQLite database into memory.

        After construction the instance holds:

        * ``self.db``: the open ``sqlite3`` connection (left open).
        * ``self.places``: place id -> the remaining columns of its
          ``places`` row.
        * ``self.admins_with_ambiguous_city``: ids of non-Town places that
          have a child Town with the same name.
        * ``self.coterminous_admins``: for those places whose *only* child is
          that single Town, place id -> the town's id.
        * ``self.aliases``: place id -> list of ``aliases`` rows with the id
          column dropped, i.e. ``(alias, name_type, language)`` tuples.
        * ``self.formatter``: a new ``AddressFormatter``.

        Progress is reported with ``print``.

        Args:
            geoplanet_db: Database location, passed to ``sqlite3.connect``.
        """
        self.db = sqlite3.connect(geoplanet_db)

        # These aren't too large and it's easier to have them in memory
        self.places = {
            row[0]: row[1:]
            for row in self.db.execute('select * from places')
        }
        self.aliases = defaultdict(list)

        self.coterminous_admins = {}
        self.admins_with_ambiguous_city = set()

        # NOTE: SQL string literals below use single quotes. Double quotes
        # denote identifiers in SQL; SQLite only treats them as strings as a
        # legacy fallback that can be disabled at compile time.
        print('Doing admin ambiguities')
        for row in self.db.execute('''select p.id,
                                             (select count(*) from places where parent_id = p.id) as num_places,
                                             (select count(*) from places where parent_id = p.id and place_type = 'Town') as num_towns,
                                             p2.id
                                      from places p
                                      join places p2
                                          on p2.parent_id = p.id
                                          and p.name = p2.name
                                          and p.place_type != 'Town'
                                          and p2.place_type = 'Town'
                                      group by p.id'''):
            place_id, num_places, num_towns, coterminous_town_id = row
            num_places = int(num_places)
            num_towns = int(num_towns)

            # The admin and its same-named town are coterminous only when the
            # town is the admin's one and only child.
            if num_places == 1 and num_towns == 1:
                self.coterminous_admins[place_id] = coterminous_town_id
            # Every row returned by the query is recorded as ambiguous,
            # coterminous or not.
            self.admins_with_ambiguous_city.add(place_id)

        print('num coterminous: {}'.format(len(self.coterminous_admins)))
        print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))

        print('Doing aliases')
        # The left join plus "p.id is NULL" is an anti-join: it drops aliases
        # of State/County places written in a language other than the
        # place's own.
        for row in self.db.execute('''select a.* from aliases a
                                      left join places p
                                          on a.id = p.id
                                          and p.place_type in ('State', 'County')
                                          and a.language != p.language
                                      where name_type != 'S' -- no colloquial aliases like "The Big Apple"
                                      and name_type != 'V' -- variants can often be demonyms like "Welsh" or "English" for UK
                                      and p.id is NULL -- exclude foreign-language states/county names
                                      order by id, language,
                                      case name_type
                                          when 'P' then 1
                                          when 'Q' then 2
                                          when 'V' then 3
                                          when 'A' then 4
                                          when 'S' then 5
                                          else 6
                                      end'''):
            place_id = row[0]
            # Skip aliases whose place was not loaded into self.places.
            if not self.places.get(place_id):
                continue

            self.aliases[place_id].append(row[1:])

        print('Doing variant aliases')
        variant_aliases = 0
        for i, row in enumerate(
                self.db.execute(
                    '''select a.*, p.name, p.country_code from aliases a
                                                   join places p using(id)
                                                   where a.name_type = 'V'
                                                   and a.language = p.language'''
                )):
            # The last two columns come from the joined place; the rest is
            # the alias row itself.
            place_name, country_code = row[-2:]
            country = country_code.lower()

            alias_row = row[:-2]
            place_id, alias, _, language = alias_row

            # self.language_codes is not set in this method; presumably it is
            # defined elsewhere on the class -- TODO confirm.
            language = self.language_codes[language]
            if language != 'unk':
                # Compare the names with affixes replaced, keeping the
                # original text when the replacement yields nothing.
                alias_sans_affixes = name_affixes.replace_affixes(
                    alias, language, country=country)
                if alias_sans_affixes:
                    alias = alias_sans_affixes

                place_name_sans_affixes = name_affixes.replace_affixes(
                    place_name, language, country=country)
                if place_name_sans_affixes:
                    place_name = place_name_sans_affixes
            else:
                language = None

            # A variant is kept only when it is equivalent to the place name.
            # The stored tuple is the untouched database row, not the
            # affix-stripped alias or the mapped language.
            if equivalent(place_name, alias, toponym_abbreviations_gazetteer,
                          language):
                self.aliases[place_id].append(alias_row[1:])
                variant_aliases += 1

            if i % 10000 == 0 and i > 0:
                print('tested {} variant aliases with {} positives'.format(
                    i, variant_aliases))

        # Freeze into a plain dict so later lookups of unknown ids do not
        # silently create empty entries.
        self.aliases = dict(self.aliases)

        self.formatter = AddressFormatter()
Exemple #3
0
 def __init__(self):
     """Create the instance's ``AddressFormatter`` and store it as ``self.formatter``."""
     self.formatter = AddressFormatter()