def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, infile, out_dir, tag_components=True):
    '''
    Creates formatted address training data for supervised sequence labeling
    (or potentially for unsupervised learning e.g. for word vectors) using
    addr:* tags in OSM.

    Example:

    cs  cz  Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country

    The field structure is similar to other training data created by this
    script i.e. {language, country, data}. The data field here is a sequence
    of labeled tokens similar to what we might see in part-of-speech tagging.

    This format uses a special character "|" to denote possible breaks in the
    input (comma, newline). This information can potentially be used
    downstream by the sequence model as these breaks may be present at
    prediction time.

    Note that for the address parser, we'd like it to be robust to many
    different types of input, so we may selectively eliminate components.

    When tag_components is False, only the formatted address is written,
    one per row, without language/country columns or component tags.

    Example:

    sr  rs  Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic

    This may be useful in learning word representations, statistical phrases,
    morphology or other models requiring only the sequence of words.

    Arguments:
        admin_rtree: index passed to osm_reverse_geocoded_components
        language_rtree: index passed to country_and_languages
        neighborhoods_rtree: index queried with point_in_poly(lat, lon)
        infile: OSM input handed to parse_osm
        out_dir: directory in which the output TSV file is created
        tag_components: write tagged rows (language, country, address) when
            True, otherwise write only the untagged formatted address
    '''
    i = 0

    formatter = AddressFormatter()
    osm_address_components.configure()

    if tag_components:
        out_filename = ADDRESS_FORMAT_DATA_TAGGED_FILENAME
    else:
        out_filename = ADDRESS_FORMAT_DATA_FILENAME

    remove_keys = OSM_IGNORE_KEYS

    # The context manager guarantees the output is flushed and closed even if
    # a record raises part-way through the input.
    with open(os.path.join(out_dir, out_filename), 'w') as out_file:
        writer = csv.writer(out_file, 'tsv_no_quote')

        for node_id, value, deps in parse_osm(infile):
            try:
                latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
            except Exception:
                # Records without usable coordinates are skipped (best-effort)
                continue

            country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
            if not (country and candidate_languages):
                continue

            for key in remove_keys:
                _ = value.pop(key, None)

            language = None
            if tag_components:
                if len(candidate_languages) == 1:
                    language = candidate_languages[0]['lang']
                else:
                    street = value.get('addr:street', None)
                    if street is not None:
                        language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
                    else:
                        language = UNKNOWN_LANGUAGE

            address_components = {k: v for k, v in value.iteritems() if k in formatter.aliases}
            formatter.replace_aliases(address_components)

            address_country = address_components.get(AddressFormatter.COUNTRY)

            # Country names
            # -------------
            #
            # In OSM, addr:country is almost always an ISO-3166 alpha-2
            # country code. However, we'd like to expand these to include
            # natural language forms of the country names we might be likely
            # to encounter in a geocoder or handwritten address.
            #
            # These splits are somewhat arbitrary but could potentially be
            # fit to data from OpenVenues or other sources on the usage of
            # country name forms.
            #
            # If the address includes a country, the selection procedure
            # proceeds as follows:
            #
            # 1. With probability a, select the country name in the language
            #    of the address (determined above), or the localized country
            #    name if the language is undetermined or ambiguous.
            #
            # 2. With probability b(1-a), sample a language from the
            #    distribution of languages on the Internet and use the
            #    country's name in that language.
            #
            # 3. This is implicit, but with probability (1-b)(1-a), keep the
            #    country code.

            non_local_language = None

            if random.random() < 0.3:
                # 30% of the time: add Quattroshapes country
                address_country = country.upper()

            r = random.random()

            # 1. 60% of the time: use the country name in the current
            #    language or the country's local language
            if address_country and r < 0.6:
                localized = None
                if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
                    localized = language_country_names.get(language, {}).get(address_country.upper())

                if not localized:
                    localized = country_localized_display_name(address_country.lower())

                if localized:
                    address_components[AddressFormatter.COUNTRY] = localized
            # 2. 10% of the time: country's name in a language sampled from
            #    the distribution of languages on the Internet
            elif address_country and r < 0.7:
                non_local_language = sample_random_language()
                lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper())
                if lang_country:
                    address_components[AddressFormatter.COUNTRY] = lang_country
            # 3. Implicit: the rest of the time keep the country code

            # States
            # ------
            #
            # Primarily for the US, Canada and Australia, OSM tends to use
            # the abbreviated state name whereas we'd like to include both
            # forms, so with some probability, replace the abbreviated name
            # with the unabbreviated one e.g. CA => California
            address_state = address_components.get(AddressFormatter.STATE)

            if address_state:
                state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language)

                if state_full_name and random.random() < 0.3:
                    address_components[AddressFormatter.STATE] = state_full_name

            # OSM boundaries
            # --------------
            #
            # For many addresses, the city, district, region, etc. are all
            # implicitly generated by the reverse geocoder e.g. we do not
            # need an addr:city tag to identify that 40.74, -74.00 is in New
            # York City as well as its parent geographies (New York county,
            # New York state, etc.)
            #
            # Where possible we augment the addr:* tags with some of the
            # reverse-geocoded relations from OSM.
            #
            # Since addresses found on the web may have the same properties,
            # we include these qualifiers in the training data.
            osm_components = osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude)
            if osm_components:
                if non_local_language is not None:
                    suffix = ':{}'.format(non_local_language)
                else:
                    suffix = ''

                name_key = ''.join(('name', suffix))
                raw_name_key = 'name'
                short_name_key = ''.join(('short_name', suffix))
                raw_short_name_key = 'short_name'
                simple_name_key = 'name:simple'
                international_name_key = 'int_name'
                alt_name_key = ''.join(('alt_name', suffix))
                raw_alt_name_key = 'alt_name'
                official_name_key = ''.join(('official_name', suffix))
                raw_official_name_key = 'official_name'
                iso_code_key = 'ISO3166-1:alpha2'
                iso_code3_key = 'ISO3166-1:alpha3'

                poly_components = defaultdict(list)

                for component, components_values in osm_components.iteritems():
                    seen = set()

                    # Choose which name to use with given probabilities
                    r = random.random()
                    if r < 0.7:
                        # 70% of the time use the name tag
                        key = name_key
                        raw_key = raw_name_key
                    elif r < 0.8:
                        # 10% of the time use the short name
                        key = short_name_key
                        raw_key = raw_short_name_key
                    elif r < 0.9:
                        # 10% of the time use the official name
                        key = official_name_key
                        raw_key = raw_official_name_key
                    else:
                        # 10% of the time use the alt name
                        key = alt_name_key
                        raw_key = raw_alt_name_key

                    for component_value in components_values:
                        r = random.random()
                        name = None

                        if iso_code3_key in component_value and r < 0.1:
                            name = component_value[iso_code3_key]
                        elif iso_code_key in component_value and r < 0.3:
                            name = component_value[iso_code_key]
                        elif language == 'en' and not non_local_language and r < 0.7:
                            # Particularly to address the US (prefer United
                            # States, not United States of America) but may
                            # capture variations in other English-speaking
                            # countries as well.
                            if simple_name_key in component_value:
                                name = component_value[simple_name_key]
                            elif international_name_key in component_value:
                                name = component_value[international_name_key]

                        if not name:
                            name = component_value.get(key, component_value.get(raw_key))

                        if not name:
                            name = component_value.get(name_key, component_value.get(raw_name_key))

                        if not name:
                            continue

                        if (component, name) not in seen:
                            poly_components[component].append(name)
                            seen.add((component, name))

                for component, vals in poly_components.iteritems():
                    if component not in address_components or non_local_language:
                        val = u', '.join(vals)
                        if component == AddressFormatter.STATE and random.random() < 0.7:
                            val = STATE_EXPANSIONS.get(address_country, {}).get(val, val)
                        address_components[component] = val

            # Neighborhoods
            # -------------
            #
            # In some cities, neighborhoods may be included in a free-text
            # address.
            #
            # OSM includes many neighborhoods but only as points, rather than
            # the polygons needed to perform reverse-geocoding. We use a
            # hybrid index containing Quattroshapes/Zetashapes polygons
            # matched fuzzily with OSM names (which are on the whole of
            # better quality).
            neighborhood = neighborhoods_rtree.point_in_poly(latitude, longitude)
            if neighborhood and AddressFormatter.SUBURB not in address_components:
                address_components[AddressFormatter.SUBURB] = neighborhood['name']

            # Version with all components
            formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)

            if tag_components:
                formatted_addresses = []
                formatted_addresses.append(formatted_address)

                address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES}
                if not address_components:
                    continue

                current_components = component_bitset(address_components.keys())

                # Randomly drop components (as long as the remaining set is
                # still valid) and emit a version of the address after each
                # removal. Iterate over a snapshot since the dict is mutated.
                for component in list(address_components.keys()):
                    if (current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component]) in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
                        address_components.pop(component)
                        current_components ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                        if not address_components:
                            break

                        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
                        formatted_addresses.append(formatted_address)

                for formatted_address in formatted_addresses:
                    if formatted_address and formatted_address.strip():
                        formatted_address = tsv_string(formatted_address)
                        if not formatted_address or not formatted_address.strip():
                            continue
                        row = (language, country, formatted_address)

                        writer.writerow(row)
            elif formatted_address and formatted_address.strip():
                formatted_address = tsv_string(formatted_address)
                writer.writerow([formatted_address])

            i += 1
            if i % 1000 == 0 and i > 0:
                print('did {} formatted addresses'.format(i))
def build_address_format_training_data_limited(language_rtree, infile, out_dir):
    '''
    Creates a special kind of formatted address training data from OSM's
    addr:* tags but are designed for use in language classification. These
    records are similar to the untagged formatted records but include the
    language and country (suitable for concatenation with the rest of the
    language training data), and remove several fields like country which
    usually do not contain helpful information for classifying the language.

    Example:

    nb      no      Olaf Ryes Plass 8 | Oslo

    Arguments:
        language_rtree: index passed to get_language_names
        infile: OSM input handed to parse_osm
        out_dir: directory in which the output TSV file is created
    '''
    i = 0

    formatter = AddressFormatter()

    remove_keys = NAME_KEYS + COUNTRY_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS

    # The context manager guarantees the output is flushed and closed even if
    # a record raises part-way through the input.
    with open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w') as f:
        writer = csv.writer(f, 'tsv_no_quote')

        for key, value, deps in parse_osm(infile):
            try:
                # Only used to reject records without usable coordinates
                latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
            except Exception:
                continue

            for k in remove_keys:
                _ = value.pop(k, None)

            if not value:
                continue

            country, name_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street')
            if not name_language:
                continue

            single_language = len(name_language) == 1
            for lang, val in name_language.iteritems():
                if lang not in languages:
                    continue

                address_dict = value.copy()
                # Look namespaced tags up in the untouched `value` rather
                # than in the copy being edited: otherwise popping e.g.
                # "addr:street:fr" before visiting "addr:street" would make
                # the plain key look un-translated and drop it too, so the
                # outcome depended on dict ordering.
                for k in list(address_dict.keys()):
                    namespaced_val = u'{}:{}'.format(k, lang)
                    if namespaced_val in value:
                        address_dict[k] = value[namespaced_val]
                    elif not single_language:
                        address_dict.pop(k)

                if not address_dict:
                    continue

                formatted_address_untagged = formatter.format_address(country, address_dict, tag_components=False)
                if formatted_address_untagged is not None:
                    formatted_address_untagged = tsv_string(formatted_address_untagged)

                    writer.writerow((lang, country, formatted_address_untagged))

            i += 1
            if i % 1000 == 0 and i > 0:
                print('did {} formatted addresses'.format(i))
def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, infile, out_dir, tag_components=True):
    '''
    Creates formatted address training data for supervised sequence labeling
    (or potentially for unsupervised learning e.g. for word vectors) using
    addr:* tags in OSM.

    Example:

    cs  cz  Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country

    The field structure is similar to other training data created by this
    script i.e. {language, country, data}. The data field here is a sequence
    of labeled tokens similar to what we might see in part-of-speech tagging.

    This format uses a special character "|" to denote possible breaks in the
    input (comma, newline). This information can potentially be used
    downstream by the sequence model as these breaks may be present at
    prediction time.

    Note that for the address parser, we'd like it to be robust to many
    different types of input, so we may selectively eliminate components.

    When tag_components is False, only the formatted address is written,
    one per row, without language/country columns or component tags.

    Example:

    sr  rs  Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic

    This may be useful in learning word representations, statistical phrases,
    morphology or other models requiring only the sequence of words.

    Arguments:
        admin_rtree: index passed to osm_reverse_geocoded_components
        language_rtree: index passed to country_and_languages
        neighborhoods_rtree: index queried with point_in_poly(lat, lon)
        quattroshapes_rtree: index queried with point_in_poly(lat, lon) for
            locality polygons carrying a GeoNames id
        geonames: object providing get_alternate_names(geonames_id)
        infile: OSM input handed to parse_osm
        out_dir: directory in which the output TSV file is created
        tag_components: write tagged rows (language, country, address) when
            True, otherwise write only the untagged formatted address
    '''
    i = 0

    formatter = AddressFormatter()
    osm_address_components.configure()

    if tag_components:
        out_filename = ADDRESS_FORMAT_DATA_TAGGED_FILENAME
    else:
        out_filename = ADDRESS_FORMAT_DATA_FILENAME

    remove_keys = OSM_IGNORE_KEYS

    alpha3_codes = {c.alpha2: c.alpha3 for c in pycountry.countries}

    # The context manager guarantees the output is flushed and closed even if
    # a record raises part-way through the input.
    with open(os.path.join(out_dir, out_filename), 'w') as out_file:
        writer = csv.writer(out_file, 'tsv_no_quote')

        for node_id, value, deps in parse_osm(infile):
            try:
                latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
            except Exception:
                # Records without usable coordinates are skipped (best-effort)
                continue

            country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
            if not (country and candidate_languages):
                continue

            for key in remove_keys:
                _ = value.pop(key, None)

            language = None

            more_than_one_official_language = len(candidate_languages) > 1

            if tag_components:
                if len(candidate_languages) == 1:
                    language = candidate_languages[0]['lang']
                else:
                    street = value.get('addr:street', None)

                    namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value]

                    if street is not None and not namespaced:
                        language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
                    elif namespaced and random.random() < 0.6:
                        language = random.choice(namespaced)
                        lang_suffix = ':{}'.format(language)
                        # Promote addr:*:<lang> tags to their un-suffixed key.
                        # - Iterate over a snapshot: inserting new keys while
                        #   iterating the dict itself raises RuntimeError.
                        # - Slice the suffix off: str.rstrip() strips a *set
                        #   of characters*, so e.g.
                        #   'addr:street:et'.rstrip(':et') == 'addr:str'.
                        for k in list(value.keys()):
                            if k.startswith('addr:') and k.endswith(lang_suffix):
                                value[k[:-len(lang_suffix)]] = value[k]
                    else:
                        language = UNKNOWN_LANGUAGE

            address_components = {k: v for k, v in value.iteritems() if k in formatter.aliases}
            formatter.replace_aliases(address_components)

            address_country = address_components.get(AddressFormatter.COUNTRY)

            # Country names
            # -------------
            #
            # In OSM, addr:country is almost always an ISO-3166 alpha-2
            # country code. However, we'd like to expand these to include
            # natural language forms of the country names we might be likely
            # to encounter in a geocoder or handwritten address.
            #
            # These splits are somewhat arbitrary but could potentially be
            # fit to data from OpenVenues or other sources on the usage of
            # country name forms.
            #
            # If the address includes a country, the selection procedure
            # proceeds as follows:
            #
            # 1. With probability a, select the country name in the language
            #    of the address (determined above), or the localized country
            #    name if the language is undetermined or ambiguous.
            #
            # 2. With probability b(1-a), sample a language from the
            #    distribution of languages on the Internet and use the
            #    country's name in that language.
            #
            # 3. This is implicit, but with probability (1-b)(1-a), keep the
            #    country code.

            non_local_language = None

            if random.random() < 0.3:
                # 30% of the time: add Quattroshapes country
                address_country = country.upper()

            r = random.random()

            # 1. 60% of the time: use the country name in the current
            #    language or the country's local language
            if address_country and r < 0.6:
                localized = None
                if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
                    localized = language_country_names.get(language, {}).get(address_country.upper())

                if not localized:
                    localized = country_localized_display_name(address_country.lower())

                if localized:
                    address_components[AddressFormatter.COUNTRY] = localized
            # 2. 10% of the time: country's name in a language sampled from
            #    the distribution of languages on the Internet
            elif address_country and r < 0.7:
                non_local_language = sample_random_language()
                lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper())
                if lang_country:
                    address_components[AddressFormatter.COUNTRY] = lang_country
            # 3. 10% of the time: use the country's alpha-3 ISO code
            elif address_country and r < 0.8:
                iso_code_alpha3 = alpha3_codes.get(address_country)
                if iso_code_alpha3:
                    address_components[AddressFormatter.COUNTRY] = iso_code_alpha3
            # 4. Implicit: the rest of the time keep the alpha-2 country code

            # Venue names
            # -----------
            #
            # Some venues have multiple names listed in OSM, grab them all
            venue_names = []
            for key in ('name', 'alt_name', 'loc_name', 'int_name', 'old_name'):
                venue_name = value.get(key)
                if venue_name:
                    venue_names.append(venue_name)

            # States
            # ------
            #
            # Primarily for the US, Canada and Australia, OSM tends to use
            # the abbreviated state name whereas we'd like to include both
            # forms, so with some probability, replace the abbreviated name
            # with the unabbreviated one e.g. CA => California
            address_state = address_components.get(AddressFormatter.STATE)

            if address_state and not non_local_language:
                state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language)

                if state_full_name and random.random() < 0.3:
                    address_components[AddressFormatter.STATE] = state_full_name
            elif address_state and non_local_language:
                # The tagged state is dropped when a non-local language was
                # sampled for this record
                _ = address_components.pop(AddressFormatter.STATE, None)

            # OSM boundaries
            # --------------
            #
            # For many addresses, the city, district, region, etc. are all
            # implicitly generated by the reverse geocoder e.g. we do not
            # need an addr:city tag to identify that 40.74, -74.00 is in New
            # York City as well as its parent geographies (New York county,
            # New York state, etc.)
            #
            # Where possible we augment the addr:* tags with some of the
            # reverse-geocoded relations from OSM.
            #
            # Since addresses found on the web may have the same properties,
            # we include these qualifiers in the training data.
            osm_components = osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude)

            if non_local_language is not None:
                osm_suffix = ':{}'.format(non_local_language)
            elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
                osm_suffix = ':{}'.format(language)
            else:
                osm_suffix = ''

            name_key = ''.join(('name', osm_suffix))
            raw_name_key = 'name'
            simple_name_key = 'name:simple'
            international_name_key = 'int_name'

            iso_code_key = 'ISO3166-1:alpha2'
            iso_code3_key = 'ISO3166-1:alpha3'

            if osm_components:
                poly_components = defaultdict(list)

                existing_city_name = address_components.get(AddressFormatter.CITY)

                for component, components_values in osm_components.iteritems():
                    seen_names = set()

                    key, raw_key = osm_pick_random_name_key(suffix=osm_suffix)

                    for component_value in components_values:
                        r = random.random()
                        name = None

                        if iso_code3_key in component_value and r < 0.1:
                            name = component_value[iso_code3_key]
                        elif iso_code_key in component_value and r < 0.3:
                            name = component_value[iso_code_key]
                        elif language == 'en' and not non_local_language and r < 0.7:
                            # Particularly to address the US (prefer United
                            # States, not United States of America) but may
                            # capture variations in other English-speaking
                            # countries as well.
                            if simple_name_key in component_value:
                                name = component_value[simple_name_key]
                            elif international_name_key in component_value:
                                name = component_value[international_name_key]

                        if not name:
                            name = component_value.get(key, component_value.get(raw_key))

                        # Avoid repeating the tagged city name under another
                        # (non-city) component: fall back to the plain name
                        if not name or (component != AddressFormatter.CITY and name == existing_city_name):
                            name = component_value.get(name_key, component_value.get(raw_name_key))

                        if not name or (component != AddressFormatter.CITY and name == existing_city_name):
                            continue

                        if (component, name) not in seen_names:
                            poly_components[component].append(name)
                            seen_names.add((component, name))

                for component, vals in poly_components.iteritems():
                    if component not in address_components or (non_local_language and random.random() < 0.4):
                        if component == AddressFormatter.STATE_DISTRICT and random.random() < 0.5:
                            num = random.randrange(1, len(vals) + 1)
                            val = u', '.join(vals[:num])
                        else:
                            val = random.choice(vals)

                        if component == AddressFormatter.STATE and random.random() < 0.7:
                            val = STATE_EXPANSIONS.get(address_country, {}).get(val, val)
                        address_components[component] = val

            # Quattroshapes/GeoNames cities
            # -----------------------------
            #
            # Quattroshapes isn't great for everything, but it has decent
            # city boundaries in places where OSM sometimes does not (or at
            # least in places where we aren't currently able to create valid
            # polygons). Quattroshapes itself doesn't reliably use local
            # names, which we'll want for consistency, so names are looked
            # up through the GeoNames id instead.
            if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < 0.2):
                lang = non_local_language or language
                quattroshapes_cities = quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True)
                for result in quattroshapes_cities:
                    if result.get(quattroshapes_rtree.LEVEL) == quattroshapes_rtree.LOCALITY and quattroshapes_rtree.GEONAMES_ID in result:
                        geonames_id = int(result[quattroshapes_rtree.GEONAMES_ID].split(',')[0])
                        names = geonames.get_alternate_names(geonames_id)

                        if not names or lang not in names:
                            continue

                        city = None
                        if 'abbr' not in names or non_local_language:
                            # Use the common city name in the target language
                            city = names[lang][0][0]
                        elif random.random() < 0.1:
                            # Use an abbreviation: NYC, BK, SF, etc.
                            city = random.choice(names['abbr'])[0]

                        if not city or not city.strip():
                            continue
                        address_components[AddressFormatter.CITY] = city
                        break
                else:
                    # No usable city was found (loop finished without break).
                    # osm_components may be falsy, so guard the membership
                    # tests against it.
                    if non_local_language and AddressFormatter.CITY in address_components and osm_components and (
                            AddressFormatter.CITY_DISTRICT in osm_components or
                            AddressFormatter.SUBURB in osm_components):
                        address_components.pop(AddressFormatter.CITY)

            # Neighborhoods
            # -------------
            #
            # In some cities, neighborhoods may be included in a free-text
            # address.
            #
            # OSM includes many neighborhoods but only as points, rather than
            # the polygons needed to perform reverse-geocoding. We use a
            # hybrid index containing Quattroshapes/Zetashapes polygons
            # matched fuzzily with OSM names (which are on the whole of
            # better quality).
            neighborhoods = neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
            neighborhood_levels = defaultdict(list)
            for neighborhood in neighborhoods:
                place_type = neighborhood.get('place')
                polygon_type = neighborhood.get('polygon_type')

                key, raw_key = osm_pick_random_name_key(suffix=osm_suffix)
                name = neighborhood.get(key, neighborhood.get(raw_key))

                if not name:
                    name = neighborhood.get(name_key, neighborhood.get(raw_name_key))

                    name_prefix = neighborhood.get('name:prefix')

                    # Only prepend the prefix when there is a name to attach
                    # it to; joining with None would raise TypeError.
                    if name and name_prefix and random.random() < 0.5:
                        name = u' '.join([name_prefix, name])

                if not name:
                    continue

                neighborhood_level = AddressFormatter.SUBURB

                if place_type == 'borough' or polygon_type == 'local_admin':
                    neighborhood_level = AddressFormatter.CITY_DISTRICT

                    # Optimization so we don't use e.g. Brooklyn multiple times
                    city_name = address_components.get(AddressFormatter.CITY)
                    if name == city_name:
                        name = neighborhood.get(name_key, neighborhood.get(raw_name_key))
                        if not name or name == city_name:
                            continue

                neighborhood_levels[neighborhood_level].append(name)

            for component, level_names in neighborhood_levels.iteritems():
                if component not in address_components and random.random() < 0.5:
                    address_components[component] = level_names[0]

            # Name normalization
            # ------------------
            #
            # Probabilistically strip standard prefixes/suffixes
            # e.g. "London Borough of"
            for component in BOUNDARY_COMPONENTS:
                name = address_components.get(component)
                if not name:
                    continue
                replacement = replace_name_prefixes(replace_name_suffixes(name))
                if replacement != name and random.random() < 0.6:
                    address_components[component] = replacement

            # Name deduping
            # -------------
            #
            # For some cases like "Antwerpen, Antwerpen, Antwerpen"
            # that are very unlikely to occur in real life.
            name_components = defaultdict(list)

            for component in (AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB):
                name = address_components.get(component)
                if name:
                    name_components[name].append(component)

            for name, components in name_components.iteritems():
                if len(components) > 1:
                    # Keep the first component with this name, drop the rest
                    for component in components[1:]:
                        address_components.pop(component, None)

            # House number cleanup
            # --------------------
            #
            # For some OSM nodes, particularly in Uruguay, we get house
            # numbers that are actually a comma-separated list.
            #
            # If there's one comma in the house number, allow it as it might
            # be legitimate, but if there are 2 or more, just take the first
            # one.
            house_number = address_components.get(AddressFormatter.HOUSE_NUMBER)
            if house_number and house_number.count(',') >= 2:
                for num in house_number.split(','):
                    num = num.strip()
                    if num:
                        address_components[AddressFormatter.HOUSE_NUMBER] = num
                        break
                else:
                    # Every piece was empty: no usable house number
                    address_components.pop(AddressFormatter.HOUSE_NUMBER, None)

            # Version with all components
            formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)

            if tag_components:
                formatted_addresses = []
                formatted_addresses.append(formatted_address)

                seen_formatted = set([formatted_address])

                address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES}
                if not address_components:
                    continue

                current_components = list(address_components.keys())
                random.shuffle(current_components)

                component_set = component_bitset(address_components.keys())

                # Randomly drop components (as long as the remaining set is
                # still valid) and emit the address again after each removal
                for component in current_components:
                    if (component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component]) in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
                        address_components.pop(component)
                        component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                        if not address_components:
                            break

                        # Since venue names are 1-per-record, we must use them all
                        for venue_name in (venue_names or [None]):
                            if venue_name and AddressFormatter.HOUSE in address_components:
                                address_components[AddressFormatter.HOUSE] = venue_name
                            formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
                            if formatted_address not in seen_formatted:
                                formatted_addresses.append(formatted_address)
                                seen_formatted.add(formatted_address)

                for formatted_address in formatted_addresses:
                    if formatted_address and formatted_address.strip():
                        formatted_address = tsv_string(formatted_address)
                        if not formatted_address or not formatted_address.strip():
                            continue
                        row = (language, country, formatted_address)

                        writer.writerow(row)
            elif formatted_address and formatted_address.strip():
                formatted_address = tsv_string(formatted_address)
                writer.writerow([formatted_address])

            i += 1
            if i % 1000 == 0 and i > 0:
                print('did {} formatted addresses'.format(i))
class OpenAddressesFormatter(object):
    '''
    Builds formatted (and optionally tagged) address strings from
    OpenAddresses CSV files.

    Each CSV row is mapped to address components using the per-file
    configuration, cleaned and validated, augmented with boundary and
    country information, and finally passed to AddressFormatter.
    '''

    # Regex substitutions applied to raw field values. The None key holds the
    # substitutions applied to every field (after the field-specific ones).
    field_regex_replacements = {
        # All fields
        None: [
            (re.compile('<\s*null\s*>', re.I), u''),
            (re.compile('[\s]{2,}'), six.u(' ')),
            (re.compile('\`'), u"'"),
            (re.compile('\-?\*'), u""),
        ],
        AddressFormatter.HOUSE_NUMBER: [
            # Most of the house numbers in Montreal start with "#"
            (re.compile('^#', re.UNICODE), u''),
            # Some house numbers have multiple hyphens
            (re.compile('[\-]{2,}'), u'-'),
            # Some house number ranges are split up like "12 -14"
            (re.compile('[\s]*\-[\s]*'), u'-'),
        ]
    }

    # Language => compiled pattern matching a trailing "<unit phrase> <number>"
    # built from the 'unit_types_numbered' phrase dictionaries.
    unit_type_regexes = {}

    for (lang, dictionary_type), values in six.iteritems(address_phrase_dictionaries.phrases):
        if dictionary_type == 'unit_types_numbered':
            # Phrases of one or two characters are left out of the alternation
            unit_phrases = [safe_encode(p) for p in itertools.chain(*values) if len(p) > 2]
            pattern = re.compile(
                r'\b(?:{})\s+(?:#?\s*)(?:[\d]+|[a-z]|[a-z]\-?[\d]+|[\d]+\-?[a-z])\s*$'.format(
                    safe_encode('|').join(unit_phrases)),
                re.I | re.UNICODE)
            unit_type_regexes[lang] = pattern

    def __init__(self, components, country_rtree, debug=False):
        '''
        :param components: address components helper used for cleanup,
                           reverse geocoding and boundary lookups
        :param country_rtree: index exposing country_and_languages(lat, lon)
                              and clear_cache()
        :param debug: when True, build_training_data stops reading a source
                      file at the first progress message
        '''
        self.components = components
        self.country_rtree = country_rtree
        self.debug = debug

        self.formatter = AddressFormatter()

    class validators:
        '''
        Namespace of per-component validation callbacks. Each takes the
        cleaned field value and returns a truthy value if it should be kept.
        '''

        @classmethod
        def validate_postcode(cls, postcode):
            '''
            Postcodes that are all zeros are improperly-formatted NULL values
            '''
            return not all((c in ('0', '-', '.', ' ', ',') for c in postcode))

        @classmethod
        def validate_street(cls, street):
            '''
            Streets should not be simple numbers. If they are it's probably a
            copy/paste error and should be the house number.
            '''
            return not is_numeric(street)

        @classmethod
        def validate_house_number(cls, house_number):
            '''
            House number doesn't necessarily have to be numeric, but in some of the
            OpenAddresses data sets the house number field is equal to the capitalized
            street name, so this at least provides protection against insane values
            for house number at the cost of maybe missing a few houses numbered "A", etc.

            Also OpenAddresses primarily comes from county GIS servers, etc. which use
            a variety of database schemas and don't always handle NULLs very well.
            While a single zero is a valid house number, in OpenAddresses it's more
            likely an error, so values whose digits are all zeros are rejected.
            '''
            try:
                house_number = int(house_number.strip())
                return house_number > 0
            except (ValueError, TypeError):
                house_number = house_number.strip()
                # Not a plain integer: accept only the known numeric-ish shapes,
                # and only when at least one digit is non-zero
                return house_number and (
                    is_numeric(house_number) or
                    fraction_regex.match(house_number) or
                    number_space_letter_regex.match(house_number) or
                    number_slash_number_regex.match(house_number) or
                    number_fraction_regex.match(house_number)
                ) and not all((c == '0' for c in house_number if c.isdigit()))

        @classmethod
        def validate_house_number_sin_numero(cls, house_number):
            '''
            Accept values matching sin_numero_regex in addition to
            anything validate_house_number accepts.
            '''
            if sin_numero_regex.match(house_number):
                return True
            return cls.validate_house_number(house_number)

        @classmethod
        def validate_russian_house_number(cls, house_number):
            '''
            Accept values matching any of the Russian-specific house number
            patterns in addition to anything validate_house_number accepts.
            '''
            if dom_korpus_stroyeniye_regex.match(house_number):
                return True
            elif uchastok_regex.match(house_number):
                return True
            elif bea_nomera_regex.match(house_number):
                return True
            return cls.validate_house_number(house_number)

        @classmethod
        def validate_colombian_house_number(cls, house_number):
            '''
            Colombian house numbers are never rejected.
            '''
            return True

        @classmethod
        def validate_chinese_house_number(cls, house_number):
            '''
            Accept house numbers made up entirely of numeric tokens and the
            characters 号, 栋 and 附; otherwise defer to validate_house_number.
            '''
            if not house_number:
                return False
            tokens = tokenize(house_number)

            if all((c in token_types.NUMERIC_TOKEN_TYPES or t in (u'号', u'栋', u'附')) for t, c in tokens):
                return True
            return cls.validate_house_number(house_number)

    # Default validator for each component
    component_validators = {
        AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
        AddressFormatter.ROAD: validators.validate_street,
        AddressFormatter.POSTCODE: validators.validate_postcode,
    }

    # Validators which take precedence over the defaults for a given language
    language_validators = {
        SPANISH: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
        },
        PORTUGUESE: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
        },
        RUSSIAN: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number,
        },
        CHINESE: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number,
        }
    }

    # Validators which take precedence over both of the above for a country
    country_validators = {
        Countries.COLOMBIA: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_colombian_house_number
        }
    }

    # A run of digits that is not already followed by a digit, 号 or 栋
    chinese_annex_regex = re.compile(u'([\d]+)(?![\d号栋])', re.U)

    @classmethod
    def format_chinese_house_number(cls, house_number):
        '''
        Append 号 to every number in house_number that is not already
        followed by 号 or 栋. Empty values are returned unchanged.
        '''
        if not house_number:
            return house_number
        return cls.chinese_annex_regex.sub(u'\\1号', house_number)

    @classmethod
    def format_colombian_house_number(cls, house_number):
        '''
        Randomly vary the presentation of a Colombian house number.

        If the value matches colombian_standard_house_number_regex, its two
        groups (cross street and building number) are re-joined with a random
        separator, internal spaces are removed from each part half the time,
        and half the time a random prefix such as "#" or "no." is prepended
        unless the value already starts with one. Other values are only
        stripped.
        '''
        house_number = house_number.strip()
        match = colombian_standard_house_number_regex.match(house_number)
        if match:
            separator = random.choice((u'-', u' - ', u' '))

            cross_street, building_number = match.groups()

            numbers = []
            if cross_street and u' ' in cross_street and random.choice((True, False)):
                cross_street = cross_street.replace(u' ', u'')

            if cross_street:
                numbers.append(cross_street)

            if building_number and u' ' in building_number and random.choice((True, False)):
                building_number = building_number.replace(u' ', u'')

            if building_number:
                numbers.append(building_number)

            if numbers:
                house_number = separator.join(numbers)
                house_number_prefixes = (u'#', u'no.', u'no', u'nº')
                # str.startswith accepts a tuple of candidate prefixes
                if random.choice((True, False)) and not house_number.lower().startswith(house_number_prefixes):
                    house_number = u' '.join([random.choice(house_number_prefixes), house_number])

        return house_number

    def get_property(self, key, *configs):
        '''
        Return the first non-None value stored under key in configs, which
        are searched in the order given. Returns None if no config has it.
        '''
        for config in configs:
            value = config.get(key, None)
            if value is not None:
                return value
        return None

    def cldr_country_name(self, country_code, language, configs):
        '''
        With probability cldr_country_probability, pick a representation of
        the country (localized name, ISO-3166 name, alpha-2 or alpha-3 code)
        weighted by the configured probabilities. Returns None when no
        country name should be added.
        '''
        cldr_country_prob = float(self.get_property('cldr_country_probability', *configs))

        country_name = None

        if random.random() < cldr_country_prob:
            localized, iso_3166, alpha2, alpha3 = values = range(4)
            localized_prob = float(self.get_property('localized_name_probability', *configs))
            iso_3166_prob = float(self.get_property('iso_3166_name_probability', *configs))
            alpha2_prob = float(self.get_property('iso_alpha_2_code_probability', *configs))
            alpha3_prob = float(self.get_property('iso_alpha_3_code_probability', *configs))

            probs = cdf([localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob])

            country_type = weighted_choice(values, probs)

            # The upper-cased code is the alpha-2 case and the fallback
            country_name = country_code.upper()
            if country_type == localized:
                country_name = (country_names.localized_name(country_code, language) or
                                country_names.localized_name(country_code) or
                                country_name)
            elif country_type == iso_3166:
                country_name = country_names.iso3166_name(country_code)
            elif country_type == alpha3:
                country_name = country_names.alpha3_code(country_code) or country_name

        return country_name

    @classmethod
    def cleanup_number(cls, num, strip_commas=False):
        '''
        Normalize a numeric string such as a house number or postcode.

        Integers are returned as-is (stripped). Values which only parse as
        floats, e.g. "12.0", are truncated to their integer part with any
        leading zeros restored. Anything else, including values which parse
        to infinity or NaN, is returned stripped but otherwise unchanged.

        :param num: string value to clean
        :param strip_commas: remove all commas before parsing
        '''
        num = num.strip()
        if strip_commas:
            num = num.replace(six.u(','), six.u(''))
        try:
            int(num)
        except (ValueError, TypeError):
            try:
                num_float = float(num)
                leading_zeros = 0
                for c in num:
                    if c == six.u('0'):
                        leading_zeros += 1
                    else:
                        break
                num = safe_decode(int(num_float))
                if leading_zeros:
                    num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int(float('inf')), e.g. for "inf" or "1e999"
                pass
        return num

    @classmethod
    def fix_component_encodings(cls, components):
        '''
        Return a copy of components with each value decoded and passed
        through ftfy.fix_encoding.
        '''
        return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)}

    def formatted_addresses(self, country_dir, path, configs, tag_components=True):
        '''
        Generator over the formatted addresses in a single OpenAddresses CSV.

        :param country_dir: country directory of the source file, used as the
                            country when the point lookup fails or disagrees
        :param path: path to the CSV file; must have LAT and LON columns
        :param configs: config dicts ordered from most to least specific
        :param tag_components: passed through to AddressFormatter.format_address

        Yields (language, country, formatted) tuples. A row can produce up to
        three of them: the full address, an address-only version and a
        place-only version. The input file is closed when the generator is
        exhausted or closed.
        '''
        abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
        separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
        abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
        separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
        abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs))

        add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
        add_osm_neighborhoods = bool(self.get_property('add_osm_neighborhoods', *configs) or False)
        osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs)
        non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False)
        house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False)
        numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
        postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)

        address_only_probability = float(self.get_property('address_only_probability', *configs))
        place_only_probability = float(self.get_property('place_only_probability', *configs))
        place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))

        city_replacements = self.get_property('city_replacements', *configs)

        override_country_dir = self.get_property('override_country_dir', *configs)

        postcode_length = int(self.get_property('postcode_length', *configs) or 0)

        drop_address_probability = place_only_probability + place_and_postcode_probability

        ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])

        # Field => single alternation regex over all the configured patterns
        ignore_fields_containing = {
            field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]),
                              re.I | re.UNICODE)
            for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))
        }

        # Field => list of (compiled pattern, alias config) pairs
        alias_fields_containing = {
            field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
            for field, value in six.iteritems(dict(self.get_property('alias_fields_containing', *configs) or {}))
        }

        config_language = self.get_property('language', *configs)

        add_components = self.get_property('add', *configs)

        fields = self.get_property('fields', *configs)
        if not fields:
            return

        field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)}
        mapped_values = {f['component']: f['value_map'] for f in six.itervalues(fields)
                         if hasattr(f.get('value_map'), 'get')}

        # The context manager closes the file even if the consumer abandons
        # the generator part-way through
        with open(path) as csv_file:
            reader = unicode_csv_reader(csv_file)
            headers = next(reader, None)
            if headers is None:
                # Empty file: nothing to yield
                return

            header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
            latitude_index = headers.index('LAT')
            longitude_index = headers.index('LON')

            # Clear cached polygons
            self.components.osm_admin_rtree.clear_cache()
            self.components.neighborhoods_rtree.clear_cache()

            for row in reader:
                try:
                    latitude = float(row[latitude_index])
                    longitude = float(row[longitude_index])
                except (ValueError, TypeError):
                    continue

                language = config_language

                components = {}

                skip_record = False

                for i, key in six.iteritems(header_indices):
                    value = row[i].strip()
                    if not value and key in ignore_rows_missing_fields:
                        skip_record = True
                        break
                    elif not value:
                        continue

                    if key in mapped_values:
                        value = mapped_values[key].get(value, value)

                    if key == AddressFormatter.ROAD and language == SPANISH:
                        value = self.components.spanish_street_name(value)

                    if key == AddressFormatter.POSTCODE:
                        value = self.cleanup_number(value)

                        if postcode_strip_non_digit_chars:
                            value = six.u('').join((c for c in value if c.isdigit()))

                        if value and not is_numeric(value) and numeric_postcodes_only:
                            continue
                        else:
                            if postcode_length:
                                # Left-pad with zeros, then truncate to the configured length
                                value = value.zfill(postcode_length)[:postcode_length]

                    if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
                        # Boundaries from the file are ignored when they come from OSM instead
                        if add_osm_boundaries:
                            continue
                        value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
                        if value and ((len(value) < 2 and
                                       not get_string_script(value)[0].lower() in ideographic_scripts) or
                                      is_numeric(value)):
                            continue

                    if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                        continue

                    for exp, sub_val in self.field_regex_replacements.get(key, []):
                        value = exp.sub(sub_val, value)

                    for exp, sub_val in self.field_regex_replacements.get(None, []):
                        value = exp.sub(sub_val, value)

                    value = value.strip(', -')

                    # Most specific validator wins: country, then language, then component default
                    validator = self.country_validators.get(country_dir, {}).get(
                        key, self.language_validators.get(language, {}).get(
                            key, self.component_validators.get(key, None)))

                    if validator is not None and not validator(value):
                        continue

                    if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
                        continue

                    # A matching alias pattern may reassign the value to another component
                    for (pattern, alias) in alias_fields_containing.get(key, []):
                        if pattern.search(value):
                            if 'component' in alias:
                                key = alias['component']

                    if value:
                        components[key] = value

                if skip_record:
                    continue

                if components:
                    country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
                    if not (country and candidate_languages) or (country != country_dir and not override_country_dir):
                        country = country_dir
                        candidate_languages = get_country_languages(country)
                        if not candidate_languages:
                            continue
                        # list() so the result can be indexed below even when items() is a view
                        candidate_languages = list(candidate_languages.items())

                    components = self.fix_component_encodings(components)

                    if language is None:
                        language = AddressComponents.address_language(components, candidate_languages)

                    street = components.get(AddressFormatter.ROAD, None)
                    if street is not None:
                        street = street.strip()
                        street = AddressComponents.cleaned_name(street)

                        if language == UNKNOWN_LANGUAGE:
                            strip_unit_language = candidate_languages[0][0] if candidate_languages else None
                        else:
                            strip_unit_language = language

                        street = self.components.strip_unit_phrases_for_language(street, strip_unit_language)

                        street = abbreviate(street_types_gazetteer, street, language,
                                            abbreviate_prob=abbreviate_street_prob,
                                            separate_prob=separate_street_prob)
                        components[AddressFormatter.ROAD] = street

                    house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                    if house_number:
                        house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)

                        if language == CHINESE:
                            house_number = self.format_chinese_house_number(house_number)

                        if country_dir == Countries.COLOMBIA:
                            house_number = self.format_colombian_house_number(house_number)

                        if house_number is not None:
                            components[AddressFormatter.HOUSE_NUMBER] = house_number

                    unit = components.get(AddressFormatter.UNIT, None)

                    street_required = (country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and
                                       country not in Countries.FORMER_SOVIET_UNION_COUNTRIES)

                    postcode = components.get(AddressFormatter.POSTCODE, None)

                    if postcode:
                        components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country)

                    # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                    if (not street and street_required) or \
                            (street and house_number and (street.lower() == house_number.lower())) or \
                            (unit and street and street.lower() == unit.lower()):
                        if not postcode:
                            continue
                        components = self.components.drop_address(components)

                    # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
                    unit = components.get(AddressFormatter.UNIT, None)

                    if unit is not None:
                        if is_numeric_strict(unit):
                            unit = Unit.phrase(unit, language, country=country)
                        elif non_numeric_units:
                            unit = abbreviate(unit_types_gazetteer, unit, language,
                                              abbreviate_prob=abbreviate_unit_prob,
                                              separate_prob=separate_unit_prob)
                        else:
                            unit = None

                        if unit is not None:
                            components[AddressFormatter.UNIT] = unit
                        else:
                            components.pop(AddressFormatter.UNIT)
                            unit = None

                    # CLDR country name
                    country_name = self.cldr_country_name(country, language, configs)
                    if country_name:
                        components[AddressFormatter.COUNTRY] = country_name

                    for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                        component = components.get(component_key, None)
                        if component is not None:
                            component = abbreviate(toponym_abbreviations_gazetteer, component, language,
                                                   abbreviate_prob=abbreviate_toponym_prob)
                            component = self.components.name_hyphens(component)
                            components[component_key] = component

                    # Any components specified to be added by the config (usually state)
                    if add_components:
                        for k, v in six.iteritems(add_components):
                            if k not in components:
                                components[k] = v

                    # Get named states occasionally, added component is usually a state code
                    address_state = self.components.state_name(components, country, language)
                    if address_state:
                        components[AddressFormatter.STATE] = address_state

                    state = components.get(AddressFormatter.STATE)
                    if state:
                        state = self.components.abbreviated_state(state, country, language)
                        if state:
                            components[AddressFormatter.STATE] = state

                    # This is expensive, so only turn on for files that don't supply their own city names
                    # or for which those names are flawed
                    osm_components = []

                    # Using population=0 instead of None means if there's no known population or
                    # we don't need to add OSM components, we assume the population of the town is
                    # very small and the place name shouldn't be used unqualified (i.e. needs information
                    # like state name to disambiguate it)
                    population = 0

                    unambiguous_city = False

                    if add_osm_boundaries or AddressFormatter.CITY not in components:
                        osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
                        self.components.add_admin_boundaries(components, osm_components, country, language,
                                                             latitude, longitude)
                        categorized = self.components.categorized_osm_components(country, osm_components)
                        for component, label in categorized:
                            # Only the first component labeled as a city is consulted
                            if label == AddressFormatter.CITY:
                                unambiguous_city = self.components.unambiguous_wikipedia(component, language)
                                if 'population' in component:
                                    population = component['population']
                                break

                    if AddressFormatter.CITY not in components and city_replacements:
                        components.update({k: v for k, v in six.iteritems(city_replacements)
                                           if k not in components})

                    # The neighborhood index is cheaper so can turn on for whole countries
                    neighborhood_components = []
                    if add_osm_neighborhoods:
                        neighborhood_components = self.components.neighborhood_components(latitude, longitude)
                        self.components.add_neighborhoods(components, neighborhood_components, country, language,
                                                          replace_city=osm_neighborhood_overrides_city)

                    self.components.cleanup_boundary_names(components)
                    self.components.country_specific_cleanup(components, country)

                    self.components.replace_name_affixes(components, language, country=country)

                    self.components.replace_names(components)

                    self.components.prune_duplicate_names(components)

                    self.components.remove_numeric_boundary_names(components)
                    self.components.add_house_number_phrase(components, language, country=country)
                    self.components.add_postcode_phrase(components, language, country=country)

                    # Component dropout
                    all_osm_components = osm_components + neighborhood_components
                    components = place_config.dropout_components(components, all_osm_components,
                                                                 country=country, population=population,
                                                                 unambiguous_city=unambiguous_city)

                    self.components.add_genitives(components, language)

                    formatted = self.formatter.format_address(components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                    # Occasionally also emit a version without place names or postcode
                    if random.random() < address_only_probability and street:
                        address_only_components = self.components.drop_places(components)
                        address_only_components = self.components.drop_postcode(address_only_components)
                        formatted = self.formatter.format_address(address_only_components, country,
                                                                  language=language, minimal_only=False,
                                                                  tag_components=tag_components)
                        yield (language, country, formatted)

                    rand_val = random.random()

                    # Occasionally also emit a place-only version, with or without the postcode
                    if street and house_number and rand_val < drop_address_probability:
                        components = self.components.drop_address(components)

                        if rand_val < place_and_postcode_probability:
                            components = self.components.drop_postcode(components)

                        if components and (len(components) > 1 or add_osm_boundaries):
                            formatted = self.formatter.format_address(components, country, language=language,
                                                                      minimal_only=False,
                                                                      tag_components=tag_components)
                            yield (language, country, formatted)

    def _write_formatted_addresses(self, writer, country_dir, path, configs, tag_components, count):
        '''
        Format one source file and write each non-empty result with writer.

        :param count: number of rows written before this file
        :return: number of rows written including this file
        '''
        for language, country, formatted_address in self.formatted_addresses(
                country_dir, path, configs, tag_components=tag_components):
            if not formatted_address or not formatted_address.strip():
                continue

            formatted_address = tsv_string(formatted_address)
            if not formatted_address or not formatted_address.strip():
                continue

            if tag_components:
                row = (language, country, formatted_address)
            else:
                row = (formatted_address,)

            writer.writerow(row)
            count += 1
            if count % 1000 == 0 and count > 0:
                print('did {} formatted addresses'.format(count))
                # In debug mode, stop reading this file at the first progress message
                if self.debug:
                    break
        return count

    def build_training_data(self, base_dir, out_dir, tag_components=True, sources_only=None):
        '''
        Write formatted addresses for every configured OpenAddresses source.

        :param base_dir: directory containing the per-country source files
        :param out_dir: directory in which the output TSV is created
        :param tag_components: write (language, country, address) rows to the
                               tagged file if True, otherwise address-only rows
                               to the untagged file
        :param sources_only: optional iterable of sources to restrict the run
                             to, each a path of at most 3 parts
                             (country[/subdir][/filename]), optionally
                             prefixed with base_dir
        :raises AssertionError: if a source has more than 3 parts
        '''
        all_sources_valid = sources_only is None
        valid_sources = set()
        if not all_sources_valid:
            for source in sources_only:
                if source.startswith(base_dir):
                    source = os.path.relpath(source, base_dir)

                parts = source.strip('/ ').split('/')
                if len(parts) > 3:
                    raise AssertionError('Sources may only have at maximum 3 parts')
                valid_sources.add(tuple(parts))

        if tag_components:
            out_filename = OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME
        else:
            out_filename = OPENADDRESSES_FORMAT_DATA_FILENAME

        # The context manager guarantees the output is flushed and closed
        with open(os.path.join(out_dir, out_filename), 'w') as formatted_file:
            writer = csv.writer(formatted_file, 'tsv_no_quote')

            i = 0

            for country_dir in sorted(openaddresses_config.country_configs.keys()):
                country_config = openaddresses_config.country_configs[country_dir]
                # Clear country cache for each new country
                self.country_rtree.clear_cache()

                for file_config in country_config.get('files', []):
                    filename = file_config['filename']

                    if not all_sources_valid and not ((country_dir, filename) in valid_sources or
                                                      (country_dir,) in valid_sources):
                        continue

                    print(six.u('doing {}/{}').format(country_dir, filename))

                    path = os.path.join(base_dir, country_dir, filename)
                    configs = (file_config, country_config, openaddresses_config.config)
                    i = self._write_formatted_addresses(writer, country_dir, path, configs,
                                                        tag_components, i)

                for subdir in sorted(country_config.get('subdirs', {}).keys()):
                    subdir_config = country_config['subdirs'][subdir]
                    subdir = safe_decode(subdir)
                    for file_config in subdir_config.get('files', []):
                        filename = file_config['filename']

                        if not all_sources_valid and not ((country_dir, subdir, filename) in valid_sources or
                                                          (country_dir, subdir) in valid_sources or
                                                          (country_dir,) in valid_sources):
                            continue

                        print(six.u('doing {}/{}/{}').format(country_dir, subdir, filename))

                        path = os.path.join(base_dir, country_dir, subdir, filename)

                        configs = (file_config, subdir_config, country_config, openaddresses_config.config)
                        i = self._write_formatted_addresses(writer, country_dir, path, configs,
                                                            tag_components, i)
class OpenAddressesFormatter(object): field_regex_replacements = { # All fields None: [ (re.compile('<\s*null\s*>', re.I), u''), (re.compile('[\s]{2,}'), six.u(' ')), (re.compile('\`'), u"'"), (re.compile('\-?\*'), u""), ], AddressFormatter.HOUSE_NUMBER: [ # Most of the house numbers in Montreal start with "#" (re.compile('^#', re.UNICODE), u''), # Some house numbers have multiple hyphens (re.compile('[\-]{2,}'), u'-'), # Some house number ranges are split up like "12 -14" (re.compile('[\s]*\-[\s]*'), u'-'), ] } unit_type_regexes = {} for (lang, dictionary_type), values in six.iteritems(address_phrase_dictionaries.phrases): if dictionary_type == 'unit_types_numbered': unit_phrases = [safe_encode(p) for p in itertools.chain(*values) if len(p) > 2] pattern = re.compile(r'\b(?:{})\s+(?:#?\s*)(?:[\d]+|[a-z]|[a-z]\-?[\d]+|[\d]+\-?[a-z])\s*$'.format(safe_encode('|').join(unit_phrases)), re.I | re.UNICODE) unit_type_regexes[lang] = pattern def __init__(self, components, country_rtree, debug=False): self.components = components self.country_rtree = country_rtree self.debug = debug self.formatter = AddressFormatter() class validators: @classmethod def validate_postcode(cls, postcode): ''' Postcodes that are all zeros are improperly-formatted NULL values ''' return not all((c in ('0', '-', '.', ' ', ',') for c in postcode)) @classmethod def validate_street(cls, street): ''' Streets should not be simple numbers. If they are it's probably a copy/paste error and should be the house number. ''' return not is_numeric(street) @classmethod def validate_house_number(cls, house_number): ''' House number doesn't necessarily have to be numeric, but in some of the OpenAddresses data sets the house number field is equal to the capitalized street name, so this at least provides protection against insane values for house number at the cost of maybe missing a few houses numbered "A", etc. Also OpenAddresses primarily comes from county GIS servers, etc. 
which use a variety of database schemas and don't always handle NULLs very well. Again, while a single zero is a valid house number, in OpenAddresses it's more likely an error While a single zero is a valid house number, more than one zero is not, or at least not in OpenAddresses ''' try: house_number = int(house_number.strip()) return house_number > 0 except (ValueError, TypeError): house_number = house_number.strip() return house_number and (is_numeric(house_number) or fraction_regex.match(house_number) or number_space_letter_regex.match(house_number) or number_slash_number_regex.match(house_number) or number_fraction_regex.match(house_number)) and not all((c == '0' for c in house_number if c.isdigit())) @classmethod def validate_house_number_sin_numero(cls, house_number): if sin_numero_regex.match(house_number): return True return cls.validate_house_number(house_number) @classmethod def validate_russian_house_number(cls, house_number): if dom_korpus_stroyeniye_regex.match(house_number): return True elif uchastok_regex.match(house_number): return True elif bea_nomera_regex.match(house_number): return True return cls.validate_house_number(house_number) @classmethod def validate_colombian_house_number(cls, house_number): return True @classmethod def validate_chinese_house_number(cls, house_number): if not house_number: return False tokens = tokenize(house_number) if all((c in token_types.NUMERIC_TOKEN_TYPES or t in (u'号', u'栋', u'附')) for t, c in tokens): return True return cls.validate_house_number(house_number) component_validators = { AddressFormatter.HOUSE_NUMBER: validators.validate_house_number, AddressFormatter.ROAD: validators.validate_street, AddressFormatter.POSTCODE: validators.validate_postcode, } language_validators = { SPANISH: { AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero, }, PORTUGUESE: { AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero, }, RUSSIAN: { AddressFormatter.HOUSE_NUMBER: 
validators.validate_russian_house_number, }, CHINESE: { AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number, } } country_validators = { Countries.COLOMBIA: { AddressFormatter.HOUSE_NUMBER: validators.validate_colombian_house_number } } chinese_annex_regex = re.compile(u'([\d]+)(?![\d号栋])', re.U) @classmethod def format_chinese_house_number(cls, house_number): if not house_number: return house_number return cls.chinese_annex_regex.sub(u'\\1号', house_number) @classmethod def format_colombian_house_number(cls, house_number): house_number = house_number.strip() match = colombian_standard_house_number_regex.match(house_number) if match: separator = random.choice((u'-', u' - ', u' ')) cross_street, building_number = match.groups() numbers = [] if cross_street and u' ' in cross_street and random.choice((True, False)): cross_street = cross_street.replace(u' ', u'') if cross_street: numbers.append(cross_street) if building_number and u' ' in building_number and random.choice((True, False)): building_number = building_number.replace(u' ', u'') if building_number: numbers.append(building_number) if numbers: house_number = separator.join(numbers) house_number_prefixes = (u'#', u'no.', u'no', u'nº') if random.choice((True, False)) and not any((house_number.lower().startswith(p) for p in house_number_prefixes)): house_number = u' '.join([random.choice(house_number_prefixes), house_number]) return house_number def get_property(self, key, *configs): for config in configs: value = config.get(key, None) if value is not None: return value return None def cldr_country_name(self, country_code, language, configs): cldr_country_prob = float(self.get_property('cldr_country_probability', *configs)) country_name = None if random.random() < cldr_country_prob: localized, iso_3166, alpha2, alpha3 = values = range(4) localized_prob = float(self.get_property('localized_name_probability', *configs)) iso_3166_prob = float(self.get_property('iso_3166_name_probability', 
*configs)) alpha2_prob = float(self.get_property('iso_alpha_2_code_probability', *configs)) alpha3_prob = float(self.get_property('iso_alpha_3_code_probability', *configs)) probs = cdf([localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob]) country_type = weighted_choice(values, probs) country_name = country_code.upper() if country_type == localized: country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name elif country_type == iso_3166: country_name = country_names.iso3166_name(country_code) elif country_type == alpha3: country_name = country_names.alpha3_code(country_code) or country_name return country_name @classmethod def cleanup_number(cls, num, strip_commas=False): num = num.strip() if strip_commas: num = num.replace(six.u(','), six.u('')) try: num_int = int(num) except (ValueError, TypeError): try: num_float = float(num) leading_zeros = 0 for c in num: if c == six.u('0'): leading_zeros += 1 else: break num = safe_decode(int(num_float)) if leading_zeros: num = six.u('{}{}').format(six.u('0') * leading_zeros, num) except (ValueError, TypeError): pass return num @classmethod def fix_component_encodings(cls, components): return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)} def formatted_addresses(self, country_dir, path, configs, tag_components=True): abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs)) separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0) abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs)) separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0) abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs)) add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False) add_osm_neighborhoods = 
bool(self.get_property('add_osm_neighborhoods', *configs) or False) osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs) non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False) house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False) numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False) postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False) address_only_probability = float(self.get_property('address_only_probability', *configs)) place_only_probability = float(self.get_property('place_only_probability', *configs)) place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs)) city_replacements = self.get_property('city_replacements', *configs) override_country_dir = self.get_property('override_country_dir', *configs) postcode_length = int(self.get_property('postcode_length', *configs) or 0) drop_address_probability = place_only_probability + place_and_postcode_probability ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or []) ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE) for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))} alias_fields_containing = {field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value] for field, value in six.iteritems(dict(self.get_property('alias_fields_containing', *configs) or {}))} config_language = self.get_property('language', *configs) add_components = self.get_property('add', *configs) fields = self.get_property('fields', *configs) if not fields: return field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)} mapped_values = {f['component']: f['value_map'] for f in 
six.itervalues(fields) if hasattr(f.get('value_map'), 'get')} f = open(path) reader = unicode_csv_reader(f) headers = reader.next() header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map} latitude_index = headers.index('LAT') longitude_index = headers.index('LON') # Clear cached polygons self.components.osm_admin_rtree.clear_cache() self.components.neighborhoods_rtree.clear_cache() for row in reader: try: latitude = float(row[latitude_index]) longitude = float(row[longitude_index]) except (ValueError, TypeError): continue language = config_language components = {} skip_record = False for i, key in six.iteritems(header_indices): value = row[i].strip() if not value and key in ignore_rows_missing_fields: skip_record = True break elif not value: continue if key in mapped_values: value = mapped_values[key].get(value, value) if key == AddressFormatter.ROAD and language == SPANISH: value = self.components.spanish_street_name(value) if key == AddressFormatter.POSTCODE: value = self.cleanup_number(value) if postcode_strip_non_digit_chars: value = six.u('').join((c for c in value if c.isdigit())) if value and not is_numeric(value) and numeric_postcodes_only: continue else: if postcode_length: value = value.zfill(postcode_length)[:postcode_length] if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE: if add_osm_boundaries: continue value = self.components.cleaned_name(value, first_comma_delimited_phrase=True) if value and ((len(value) < 2 and not get_string_script(value)[0].lower() in ideographic_scripts) or is_numeric(value)): continue if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value): continue for exp, sub_val in self.field_regex_replacements.get(key, []): value = exp.sub(sub_val, value) for exp, sub_val in self.field_regex_replacements.get(None, []): value = exp.sub(sub_val, value) value = value.strip(', -') validator = self.country_validators.get(country_dir, 
{}).get(key, self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None))) if validator is not None and not validator(value): continue if key in ignore_fields_containing and ignore_fields_containing[key].search(value): continue for (pattern, alias) in alias_fields_containing.get(key, []): if pattern.search(value): if 'component' in alias: key = alias['component'] if value: components[key] = value if skip_record: continue if components: country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages) or (country != country_dir and not override_country_dir): country = country_dir candidate_languages = get_country_languages(country) if not candidate_languages: continue candidate_languages = candidate_languages.items() components = self.fix_component_encodings(components) if language is None: language = AddressComponents.address_language(components, candidate_languages) street = components.get(AddressFormatter.ROAD, None) if street is not None: street = street.strip() street = AddressComponents.cleaned_name(street) if language == UNKNOWN_LANGUAGE: strip_unit_language = candidate_languages[0][0] if candidate_languages else None else: strip_unit_language = language street = self.components.strip_unit_phrases_for_language(street, strip_unit_language) street = abbreviate(street_types_gazetteer, street, language, abbreviate_prob=abbreviate_street_prob, separate_prob=separate_street_prob) components[AddressFormatter.ROAD] = street house_number = components.get(AddressFormatter.HOUSE_NUMBER, None) if house_number: house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas) if language == CHINESE: house_number = self.format_chinese_house_number(house_number) if country_dir == Countries.COLOMBIA: house_number = self.format_colombian_house_number(house_number) if house_number is not None: components[AddressFormatter.HOUSE_NUMBER] = house_number unit 
= components.get(AddressFormatter.UNIT, None) street_required = country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES postcode = components.get(AddressFormatter.POSTCODE, None) if postcode: components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country) # If there's a postcode, we can still use just the city/state/postcode, otherwise discard if (not street and street_required) or (street and house_number and (street.lower() == house_number.lower())) or (unit and street and street.lower() == unit.lower()): if not postcode: continue components = self.components.drop_address(components) # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc. unit = components.get(AddressFormatter.UNIT, None) if unit is not None: if is_numeric_strict(unit): unit = Unit.phrase(unit, language, country=country) elif non_numeric_units: unit = abbreviate(unit_types_gazetteer, unit, language, abbreviate_prob=abbreviate_unit_prob, separate_prob=separate_unit_prob) else: unit = None if unit is not None: components[AddressFormatter.UNIT] = unit else: components.pop(AddressFormatter.UNIT) unit = None # CLDR country name country_name = self.cldr_country_name(country, language, configs) if country_name: components[AddressFormatter.COUNTRY] = country_name for component_key in AddressFormatter.BOUNDARY_COMPONENTS: component = components.get(component_key, None) if component is not None: component = abbreviate(toponym_abbreviations_gazetteer, component, language, abbreviate_prob=abbreviate_toponym_prob) component = self.components.name_hyphens(component) components[component_key] = component # Any components specified to be added by the config (usually state) if add_components: for k, v in six.iteritems(add_components): if k not in components: components[k] = v # Get named states occasionally, added component is usually a state code address_state = self.components.state_name(components, 
country, language) if address_state: components[AddressFormatter.STATE] = address_state state = components.get(AddressFormatter.STATE) if state: state = self.components.abbreviated_state(state, country, language) if state: components[AddressFormatter.STATE] = state # This is expensive, so only turn on for files that don't supply their own city names # or for which those names are flawed osm_components = [] # Using population=0 instead of None means if there's no known population or # we don't need to add OSM components, we assume the population of the town is # very small and the place name shouldn't be used unqualified (i.e. needs information # like state name to disambiguate it) population = 0 unambiguous_city = False if add_osm_boundaries or AddressFormatter.CITY not in components: osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude) self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude) categorized = self.components.categorized_osm_components(country, osm_components) for component, label in categorized: if label == AddressFormatter.CITY: unambiguous_city = self.components.unambiguous_wikipedia(component, language) if 'population' in component: population = component['population'] break if AddressFormatter.CITY not in components and city_replacements: components.update({k: v for k, v in six.iteritems(city_replacements) if k not in components}) # The neighborhood index is cheaper so can turn on for whole countries neighborhood_components = [] if add_osm_neighborhoods: neighborhood_components = self.components.neighborhood_components(latitude, longitude) self.components.add_neighborhoods(components, neighborhood_components, country, language, replace_city=osm_neighborhood_overrides_city) self.components.cleanup_boundary_names(components) self.components.country_specific_cleanup(components, country) self.components.replace_name_affixes(components, language, country=country) 
self.components.replace_names(components) self.components.prune_duplicate_names(components) self.components.remove_numeric_boundary_names(components) self.components.add_house_number_phrase(components, language, country=country) self.components.add_postcode_phrase(components, language, country=country) # Component dropout all_osm_components = osm_components + neighborhood_components components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city) self.components.add_genitives(components, language) formatted = self.formatter.format_address(components, country, language=language, minimal_only=False, tag_components=tag_components) yield (language, country, formatted) if random.random() < address_only_probability and street: address_only_components = self.components.drop_places(components) address_only_components = self.components.drop_postcode(address_only_components) formatted = self.formatter.format_address(address_only_components, country, language=language, minimal_only=False, tag_components=tag_components) yield (language, country, formatted) rand_val = random.random() if street and house_number and rand_val < drop_address_probability: components = self.components.drop_address(components) if rand_val < place_and_postcode_probability: components = self.components.drop_postcode(components) if components and (len(components) > 1 or add_osm_boundaries): formatted = self.formatter.format_address(components, country, language=language, minimal_only=False, tag_components=tag_components) yield (language, country, formatted) def build_training_data(self, base_dir, out_dir, tag_components=True, sources_only=None): all_sources_valid = sources_only is None valid_sources = set() if not all_sources_valid: for source in sources_only: if source.startswith(base_dir): source = os.path.relpath(source, base_dir) parts = source.strip('/ ').split('/') if len(parts) > 3: raise AssertionError('Sources 
may only have at maximum 3 parts') valid_sources.add(tuple(parts)) if tag_components: formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w') writer = csv.writer(formatted_tagged_file, 'tsv_no_quote') else: formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_FILENAME), 'w') writer = csv.writer(formatted_tagged_file, 'tsv_no_quote') i = 0 for country_dir in sorted(openaddresses_config.country_configs.keys()): country_config = openaddresses_config.country_configs[country_dir] # Clear country cache for each new country self.country_rtree.clear_cache() for file_config in country_config.get('files', []): filename = file_config['filename'] if not all_sources_valid and not ((country_dir, filename) in valid_sources or (country_dir,) in valid_sources): continue print(six.u('doing {}/{}').format(country_dir, filename)) path = os.path.join(base_dir, country_dir, filename) configs = (file_config, country_config, openaddresses_config.config) for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components): if not formatted_address or not formatted_address.strip(): continue formatted_address = tsv_string(formatted_address) if not formatted_address or not formatted_address.strip(): continue if tag_components: row = (language, country, formatted_address) else: row = (formatted_address,) writer.writerow(row) i += 1 if i % 1000 == 0 and i > 0: print('did {} formatted addresses'.format(i)) if self.debug: break for subdir in sorted(country_config.get('subdirs', {}).keys()): subdir_config = country_config['subdirs'][subdir] subdir = safe_decode(subdir) for file_config in subdir_config.get('files', []): filename = file_config['filename'] if not all_sources_valid and not ((country_dir, subdir, filename) in valid_sources or (country_dir, subdir) in valid_sources or (country_dir,) in valid_sources): continue print(six.u('doing {}/{}/{}').format(country_dir, 
subdir, filename)) path = os.path.join(base_dir, country_dir, subdir, filename) configs = (file_config, subdir_config, country_config, openaddresses_config.config) for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components): if not formatted_address or not formatted_address.strip(): continue formatted_address = tsv_string(formatted_address) if not formatted_address or not formatted_address.strip(): continue if tag_components: row = (language, country, formatted_address) else: row = (formatted_address,) writer.writerow(row) i += 1 if i % 1000 == 0 and i > 0: print('did {} formatted addresses'.format(i)) if self.debug: break
class OpenAddressesUKFormatter(object):
    """Generate address-parser training examples from an OpenAddresses UK CSV.

    Each CSV row is mapped to address components through ``field_map``,
    cleaned and validated, and then formatted one or more times (the full
    address, optionally an address-only variant and a place-only variant)
    by ``AddressFormatter``.
    """

    # CSV column name -> address formatter component
    field_map = {
        'pao': AddressFormatter.HOUSE_NUMBER,
        'street.name': AddressFormatter.ROAD,
        'town.name': AddressFormatter.CITY,
        'postcode.name': AddressFormatter.POSTCODE
    }

    def __init__(self):
        self.formatter = AddressFormatter()

    # Component -> predicate; a value for which the predicate is falsy is dropped
    component_validators = {
        AddressFormatter.HOUSE_NUMBER: OpenAddressesFormatter.validators.validate_house_number,
        AddressFormatter.ROAD: OpenAddressesFormatter.validators.validate_street,
        AddressFormatter.POSTCODE: OpenAddressesFormatter.validators.validate_postcode,
    }

    # Sampling probabilities used by formatted_addresses.
    # NOTE(review): cldr_country_probability is not referenced anywhere in
    # this class -- confirm whether something else reads it.
    cldr_country_probability = 0.3
    address_only_probability = 0.4
    drop_address_probability = 0.6
    drop_address_and_postcode_probability = 0.1

    @classmethod
    def cleanup_number(cls, num, strip_commas=False):
        """Normalize a numeric string such as a house number.

        :param num: the raw string value
        :param strip_commas: if true, remove every comma before parsing
        :return: ``num`` stripped of surrounding whitespace (and commas if
            requested). If it does not parse as an integer but does parse
            as a float (e.g. ``"12.0"``), the float is truncated to an
            integer and any leading zeros of the input are re-attached.
            Anything that cannot be converted is returned unchanged.
        """
        num = num.strip()
        if strip_commas:
            num = num.replace(six.u(','), six.u(''))

        try:
            int(num)
        except (ValueError, TypeError):
            pass
        else:
            # Already a plain integer string, nothing to clean up
            return num

        try:
            truncated = safe_decode(int(float(num)))
            leading_zeros = len(num) - len(num.lstrip(six.u('0')))
            if leading_zeros:
                truncated = six.u('{}{}').format(six.u('0') * leading_zeros, truncated)
        except (ValueError, TypeError, OverflowError):
            # ValueError: not a number, or NaN. OverflowError: "inf",
            # "1e999", etc., which float() accepts but int() cannot convert.
            return num

        return truncated

    def fix_component_encodings(self, components):
        """Return a copy of ``components`` with every value decoded and
        passed through ``ftfy.fix_encoding``."""
        return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)}

    def formatted_addresses(self, path, tag_components=True):
        """Yield ``(language, country, formatted_address)`` tuples for the
        rows of the CSV file at ``path``.

        :param path: path of the CSV file; its first row is the header
        :param tag_components: passed through to
            ``AddressFormatter.format_address``

        A row yields the full address first. With probability
        ``address_only_probability`` (and only if it has a street) it also
        yields a version with places and postcode dropped, and with
        probability ``drop_address_probability`` (only if it has a street
        and house number) a version with the street-level components
        dropped.
        """
        country = Countries.UNITED_KINGDOM
        candidate_languages = get_country_languages(country).items()

        # The with-block keeps the input open only while the generator is
        # being consumed and closes it when the generator finishes or is
        # closed.
        with open(path) as f:
            reader = unicode_csv_reader(f)
            headers = next(reader, None)
            if headers is None:
                # Empty file: no header row, nothing to format
                return

            header_indices = {i: self.field_map[k] for i, k in enumerate(headers) if k in self.field_map}

            for row in reader:
                components = {}
                for i, key in six.iteritems(header_indices):
                    if i >= len(row):
                        # Short row: treat the missing cell as empty
                        continue

                    value = row[i].strip()
                    if not value:
                        continue

                    if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                        continue

                    value = value.strip(', -')

                    validator = self.component_validators.get(key, None)
                    if validator is not None and not validator(value):
                        continue

                    if value:
                        components[key] = value

                if not components:
                    continue

                components = self.fix_component_encodings(components)
                language = AddressComponents.address_language(components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)
                    if AddressComponents.street_name_is_valid(street):
                        street = abbreviate(street_types_gazetteer, street, language)
                        components[AddressFormatter.ROAD] = street
                    else:
                        components.pop(AddressFormatter.ROAD)
                        street = None

                house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                if house_number:
                    house_number = self.cleanup_number(house_number, strip_commas=True)
                    if house_number is not None:
                        components[AddressFormatter.HOUSE_NUMBER] = house_number

                postcode = components.get(AddressFormatter.POSTCODE, None)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if not street or (street and house_number and (street.lower() == house_number.lower())):
                    if not postcode:
                        continue
                    components = AddressComponents.drop_address(components)

                country_name = AddressComponents.cldr_country_name(country, language)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer, component, language)
                        component = AddressComponents.name_hyphens(component)
                        components[component_key] = component

                AddressComponents.replace_names(components)
                AddressComponents.prune_duplicate_names(components)
                AddressComponents.remove_numeric_boundary_names(components)
                AddressComponents.add_house_number_phrase(components, language, country=country)

                # Component dropout
                components = place_config.dropout_components(components, country=country)

                formatted = self.formatter.format_address(components, country, language=language,
                                                          minimal_only=False, tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < self.address_only_probability and street:
                    address_only_components = AddressComponents.drop_places(components)
                    address_only_components = AddressComponents.drop_postcode(address_only_components)
                    formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                # A single draw decides both whether to drop the address
                # and whether to also drop the postcode.
                rand_val = random.random()

                if street and house_number and rand_val < self.drop_address_probability:
                    components = AddressComponents.drop_address(components)

                    if rand_val < self.drop_address_and_postcode_probability:
                        components = AddressComponents.drop_postcode(components)

                    if components and (len(components) > 1):
                        formatted = self.formatter.format_address(components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self, infile, out_dir, tag_components=True):
        """Write the formatted addresses from ``infile`` as TSV in ``out_dir``.

        :param infile: path of the OpenAddresses UK CSV
        :param out_dir: directory in which the output file is created
        :param tag_components: if true, rows are
            ``(language, country, formatted_address)`` and go to the tagged
            file; otherwise rows contain only the formatted address

        Empty addresses are skipped and every address is passed through
        ``tsv_string`` before writing, as the other training-data writers
        in this file do.
        """
        if tag_components:
            filename = OPENADDRESSES_UK_FORMAT_DATA_TAGGED_FILENAME
        else:
            filename = OPENADDRESSES_UK_FORMAT_DATA_FILENAME

        i = 0
        with open(os.path.join(out_dir, filename), 'w') as formatted_file:
            writer = csv.writer(formatted_file, 'tsv_no_quote')

            for language, country, formatted_address in self.formatted_addresses(infile, tag_components=tag_components):
                if not formatted_address or not formatted_address.strip():
                    continue

                formatted_address = tsv_string(formatted_address)
                if not formatted_address or not formatted_address.strip():
                    continue

                if tag_components:
                    row = (language, country, formatted_address)
                else:
                    row = (formatted_address,)

                writer.writerow(row)
                i += 1
                if i % 1000 == 0 and i > 0:
                    print('did {} formatted addresses'.format(i))
class GeoPlanetFormatter(object):
    """Generate postcode/place training examples from a GeoPlanet SQLite DB.

    The ``places`` table is loaded into memory, aliases are indexed per
    place, and ``format_postal_codes`` then combines each postal code with
    the names of its containing places in every available language.

    NOTE(review): a second definition of this class appears later in this
    file; being executed later, that one is what the name ends up bound
    to. Confirm which copy should be kept.
    """

    # Map of GeoPlanet language codes to ISO-639 alpha2 language codes
    language_codes = {
        'ENG': 'en',
        'JPN': 'ja',
        'GER': 'de',
        'SPA': 'es',
        'FRE': 'fr',
        'UNK': 'unk',
        'ITA': 'it',
        'POR': 'pt',
        'POL': 'pl',
        'ARA': 'ar',
        'CZE': 'cs',
        'SWE': 'sv',
        'CHI': 'zh',
        'RUM': 'ro',
        'FIN': 'fi',
        'DUT': 'nl',
        'NOR': 'nb',
        'DAN': 'da',
        'HUN': 'hu',
        # ISO 639-1 for Korean is "ko" ("kr" is the code for Kanuri)
        'KOR': 'ko',
    }

    non_latin_script_languages = {
        'JPN',  # Japanese
        'ARA',  # Arabic
        'CHI',  # Chinese
        'KOR',  # Korean
    }

    # Values of the aliases.name_type column
    ALIAS_PREFERRED = 'P'
    ALIAS_PREFERRED_FOREIGN = 'Q'
    ALIAS_VARIANT = 'V'
    ALIAS_ABBREVIATED = 'A'
    ALIAS_COLLOQUIAL = 'S'

    # Map of GeoPlanet place types to address formatter types
    place_types = {
        'Continent': AddressFormatter.WORLD_REGION,
        'Country': AddressFormatter.COUNTRY,
        'CountryRegion': AddressFormatter.COUNTRY_REGION,
        'State': AddressFormatter.STATE,
        'County': AddressFormatter.STATE_DISTRICT,
        'Island': AddressFormatter.ISLAND,
        'Town': AddressFormatter.CITY,
        # Note: if we do general place queries from GeoPlanet, this
        # may have to be mapped more carefully
        'LocalAdmin': AddressFormatter.CITY_DISTRICT,
        'Suburb': AddressFormatter.SUBURB,
    }

    def __init__(self, geoplanet_db):
        """Open the database and build the in-memory indexes.

        :param geoplanet_db: path of the GeoPlanet SQLite database

        Populates:

        * ``self.places``: place id -> remaining columns of the ``places``
          row (country code, name, language, place type, parent id)
        * ``self.admins_with_ambiguous_city``: ids of non-Town places that
          have a child Town with the same name
        * ``self.coterminous_admins``: for those of the above whose only
          child is that Town, admin id -> town id
        * ``self.aliases``: place id -> list of
          ``(name, name_type, language)`` tuples
        """
        self.db = sqlite3.connect(geoplanet_db)

        # These aren't too large and it's easier to have them in memory
        self.places = {row[0]: row[1:] for row in self.db.execute('select * from places')}
        self.aliases = defaultdict(list)

        self.coterminous_admins = {}
        self.admins_with_ambiguous_city = set()

        print('Doing admin ambiguities')
        for row in self.db.execute('''select p.id,
                                      (select count(*) from places where parent_id = p.id) as num_places,
                                      (select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
                                      p2.id
                                      from places p
                                      join places p2
                                          on p2.parent_id = p.id
                                          and p.name = p2.name
                                          and p.place_type != "Town"
                                          and p2.place_type = "Town"
                                      group by p.id'''):
            place_id, num_places, num_towns, coterminous_town_id = row
            num_places = int(num_places)
            num_towns = int(num_towns)

            if num_places == 1 and num_towns == 1:
                self.coterminous_admins[place_id] = coterminous_town_id
            self.admins_with_ambiguous_city.add(place_id)

        print('num coterminous: {}'.format(len(self.coterminous_admins)))
        print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))

        print('Doing aliases')
        # The "--" SQL comments run to the end of their line, so each
        # commented clause has to stay on a line of its own.
        for row in self.db.execute('''select a.*
                                      from aliases a
                                      left join places p
                                          on a.id = p.id
                                          and p.place_type in ("State", "County")
                                          and a.language != p.language
                                      where name_type != "S" -- no colloquial aliases like "The Big Apple"
                                      and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
                                      and p.id is NULL -- exclude foreign-language states/county names
                                      order by id, language,
                                      case name_type
                                          when "P" then 1
                                          when "Q" then 2
                                          when "V" then 3
                                          when "A" then 4
                                          when "S" then 5
                                          else 6
                                      end'''):
            place = self.places.get(row[0])
            if not place:
                continue

            self.aliases[row[0]].append(row[1:])

        print('Doing variant aliases')
        variant_aliases = 0
        for i, row in enumerate(self.db.execute('''select a.*, p.name, p.country_code
                                                   from aliases a
                                                   join places p using(id)
                                                   where a.name_type = "V"
                                                   and a.language = p.language''')):
            place_name, country_code = row[-2:]
            country = country_code.lower()

            row = row[:-2]
            place_id, alias, name_type, language = row

            language = self.language_codes[language]
            if language != 'unk':
                alias_sans_affixes = name_affixes.replace_affixes(alias, language, country=country)
                if alias_sans_affixes:
                    alias = alias_sans_affixes

                place_name_sans_affixes = name_affixes.replace_affixes(place_name, language, country=country)
                if place_name_sans_affixes:
                    place_name = place_name_sans_affixes
            else:
                language = None

            # Only keep a variant if it is equivalent to the place's own
            # name; the stored tuple is the unmodified alias row.
            if equivalent(place_name, alias, toponym_abbreviations_gazetteer, language):
                self.aliases[row[0]].append(row[1:])
                variant_aliases += 1

            if i % 10000 == 0 and i > 0:
                print('tested {} variant aliases with {} positives'.format(i, variant_aliases))

        self.aliases = dict(self.aliases)

        self.formatter = AddressFormatter()

    def get_place_hierarchy(self, place_id):
        """Return the chain of places from ``place_id`` up through its parents.

        :param place_id: id of the starting place
        :return: list of ``(place_id,) + places_row`` tuples, starting
            place first
        :raises KeyError: if a place id in the chain is not in ``self.places``

        The walk stops when the parent id is 1 or when it reaches a place
        that was already visited, so a cycle in the parent links cannot
        loop forever.
        """
        all_places = []
        seen = set()

        while True:
            place = self.places[place_id]
            all_places.append((place_id,) + place)
            seen.add(place_id)

            # The last column of a places row is the parent id
            place_id = place[-1]
            if place_id == 1 or place_id in seen:
                break

        return all_places

    def get_aliases(self, place_id):
        """Return the alias tuples stored for ``place_id`` (empty list if none)."""
        return self.aliases.get(place_id, [])

    def cleanup_name(self, name):
        """Strip leading/trailing spaces, commas and hyphens from ``name``."""
        return name.strip(' ,-')

    def format_postal_codes(self, tag_components=True):
        """Yield ``(language, country, formatted_address)`` for every postal code.

        :param tag_components: passed through to
            ``AddressFormatter.format_address``

        For each postal code the containing places are collected per
        language (``None`` holds the names in the places' own language).
        Every combination of one name per place type is formatted together
        with the postcode, followed by a second version after component
        dropout when dropout actually removed something.
        """
        english = self.language_codes['ENG']

        all_postal_codes = self.db.execute('select * from postal_codes')
        for postal_code_id, country, postal_code, language, place_type, parent_id in all_postal_codes:
            country = country.lower()
            postcode_language = language

            language = self.language_codes[language]

            # Very short codes are only kept if the country's postcode
            # regex matches them in full
            if len(postal_code) <= 3:
                postcode_regex = postcode_regexes.get(country)
                match = postcode_regex.match(postal_code) if postcode_regex else None
                if not match or match.end() != len(postal_code):
                    continue

            # If the county/state is coterminous with a city and contains only one place,
            # set the parent_id to the city instead
            if parent_id in self.coterminous_admins:
                parent_id = self.coterminous_admins[parent_id]

            place_hierarchy = self.get_place_hierarchy(parent_id)

            containing_places = defaultdict(set)

            language_places = {None: containing_places}

            original_language = language

            have_default_language = False

            place_types_seen = set()

            for place_id, place_country, name, lang, place_type, parent in place_hierarchy:
                # The country of the last place in the hierarchy is the
                # one used when formatting below
                country = place_country.lower()

                # First language
                # NOTE(review): original_language was captured before this
                # loop, so this assignment does not affect the language
                # used for the default names below -- confirm intent.
                if not have_default_language and lang != postcode_language:
                    language = self.language_codes[lang]
                    have_default_language = True

                place_type = self.place_types[place_type]
                if AddressFormatter.CITY not in place_types_seen and place_id in self.admins_with_ambiguous_city:
                    continue

                name = self.cleanup_name(name)
                containing_places[place_type].add(name)

                for alias_name, name_type, alias_lang in self.get_aliases(place_id):
                    if not alias_lang:
                        alias_lang = 'UNK'
                    if alias_lang == lang and lang != 'UNK':
                        # Same language as the place itself: file under the defaults
                        alias_language = None
                    else:
                        alias_language = self.language_codes[alias_lang]

                    lang_places = language_places.setdefault(alias_language, defaultdict(set))
                    lang_places[place_type].add(self.cleanup_name(alias_name))

                place_types_seen.add(place_type)

            default_city_names = {
                name.lower()
                for name in language_places.get(None, {}).get(AddressFormatter.CITY, [])
            }

            for language, containing_places in six.iteritems(language_places):
                is_default_language = language is None
                if is_default_language:
                    language = original_language

                country_localized_name = country_names.localized_name(country, language)
                if country_localized_name:
                    containing_places[AddressFormatter.COUNTRY].add(country_localized_name)

                # Alpha-3 country code for the default-language names and
                # for English. The check has to use the mapped language
                # code: by this point ``language`` is never None and never
                # the raw GeoPlanet code "ENG".
                country_alpha3_code = country_names.alpha3_code(country)
                if country_alpha3_code and (is_default_language or language == english):
                    containing_places[AddressFormatter.COUNTRY].add(country_alpha3_code)

                keys = list(containing_places)
                all_values = [containing_places[k] for k in keys]

                keys_set = set(keys)

                for values in itertools.product(*all_values):
                    components = {
                        AddressFormatter.POSTCODE: postal_code
                    }

                    if not default_city_names:
                        components.update(zip(keys, values))
                    else:
                        for k, v in zip(keys, values):
                            if k == AddressFormatter.CITY or AddressFormatter.CITY in keys_set or v.lower() not in default_city_names:
                                components[k] = v

                    format_language = language if self.formatter.template_language_matters(country, language) else None
                    formatted = self.formatter.format_address(components, country, language=format_language,
                                                              minimal_only=False, tag_components=tag_components)

                    yield (language, country, formatted)

                    component_keys = set(components)
                    components = place_config.dropout_components(components, (), country=country, population=0)

                    # Only emit the dropout version if it differs from the full one
                    if len(components) > 1 and set(components) ^ component_keys:
                        formatted = self.formatter.format_address(components, country, language=format_language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self, out_dir, tag_components=True):
        """Write all formatted postal codes as TSV in ``out_dir``.

        :param out_dir: directory in which the output file is created
        :param tag_components: if true, rows are
            ``(language, country, formatted_address)`` and go to the tagged
            file; otherwise rows contain only the formatted address
        """
        if tag_components:
            filename = GEOPLANET_FORMAT_DATA_TAGGED_FILENAME
        else:
            filename = GEOPLANET_FORMAT_DATA_FILENAME

        i = 0
        with open(os.path.join(out_dir, filename), 'w') as formatted_file:
            writer = csv.writer(formatted_file, 'tsv_no_quote')

            for language, country, formatted_address in self.format_postal_codes(tag_components=tag_components):
                if not formatted_address or not formatted_address.strip():
                    continue

                formatted_address = tsv_string(formatted_address)
                if not formatted_address or not formatted_address.strip():
                    continue

                if tag_components:
                    row = (language, country, formatted_address)
                else:
                    row = (formatted_address,)

                writer.writerow(row)
                i += 1
                if i % 1000 == 0 and i > 0:
                    print('did {} formatted addresses'.format(i))
class GeoPlanetFormatter(object):
    """Generate postcode/place training examples from a GeoPlanet SQLite DB.

    The ``places`` table is loaded into memory, aliases are indexed per
    place, and ``format_postal_codes`` then combines each postal code with
    the names of its containing places in every available language.

    NOTE(review): an earlier definition of this class appears above in
    this file; this later one is what the name ends up bound to. Confirm
    which copy should be kept.
    """

    # Map of GeoPlanet language codes to ISO-639 alpha2 language codes
    language_codes = {
        'ENG': 'en',
        'JPN': 'ja',
        'GER': 'de',
        'SPA': 'es',
        'FRE': 'fr',
        'UNK': 'unk',
        'ITA': 'it',
        'POR': 'pt',
        'POL': 'pl',
        'ARA': 'ar',
        'CZE': 'cs',
        'SWE': 'sv',
        'CHI': 'zh',
        'RUM': 'ro',
        'FIN': 'fi',
        'DUT': 'nl',
        'NOR': 'nb',
        'DAN': 'da',
        'HUN': 'hu',
        # ISO 639-1 for Korean is "ko" ("kr" is the code for Kanuri)
        'KOR': 'ko',
    }

    non_latin_script_languages = {
        'JPN',  # Japanese
        'ARA',  # Arabic
        'CHI',  # Chinese
        'KOR',  # Korean
    }

    # Values of the aliases.name_type column
    ALIAS_PREFERRED = 'P'
    ALIAS_PREFERRED_FOREIGN = 'Q'
    ALIAS_VARIANT = 'V'
    ALIAS_ABBREVIATED = 'A'
    ALIAS_COLLOQUIAL = 'S'

    # Map of GeoPlanet place types to address formatter types
    place_types = {
        'Continent': AddressFormatter.WORLD_REGION,
        'Country': AddressFormatter.COUNTRY,
        'CountryRegion': AddressFormatter.COUNTRY_REGION,
        'State': AddressFormatter.STATE,
        'County': AddressFormatter.STATE_DISTRICT,
        'Island': AddressFormatter.ISLAND,
        'Town': AddressFormatter.CITY,
        # Note: if we do general place queries from GeoPlanet, this
        # may have to be mapped more carefully
        'LocalAdmin': AddressFormatter.CITY_DISTRICT,
        'Suburb': AddressFormatter.SUBURB,
    }

    def __init__(self, geoplanet_db):
        """Open the database and build the in-memory indexes.

        :param geoplanet_db: path of the GeoPlanet SQLite database

        Populates:

        * ``self.places``: place id -> remaining columns of the ``places``
          row (country code, name, language, place type, parent id)
        * ``self.admins_with_ambiguous_city``: ids of non-Town places that
          have a child Town with the same name
        * ``self.coterminous_admins``: for those of the above whose only
          child is that Town, admin id -> town id
        * ``self.aliases``: place id -> list of
          ``(name, name_type, language)`` tuples
        """
        self.db = sqlite3.connect(geoplanet_db)

        # These aren't too large and it's easier to have them in memory
        self.places = {row[0]: row[1:] for row in self.db.execute('select * from places')}
        self.aliases = defaultdict(list)

        self.coterminous_admins = {}
        self.admins_with_ambiguous_city = set()

        print('Doing admin ambiguities')
        for row in self.db.execute('''select p.id,
                                      (select count(*) from places where parent_id = p.id) as num_places,
                                      (select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
                                      p2.id
                                      from places p
                                      join places p2
                                          on p2.parent_id = p.id
                                          and p.name = p2.name
                                          and p.place_type != "Town"
                                          and p2.place_type = "Town"
                                      group by p.id'''):
            place_id, num_places, num_towns, coterminous_town_id = row
            num_places = int(num_places)
            num_towns = int(num_towns)

            if num_places == 1 and num_towns == 1:
                self.coterminous_admins[place_id] = coterminous_town_id
            self.admins_with_ambiguous_city.add(place_id)

        print('num coterminous: {}'.format(len(self.coterminous_admins)))
        print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))

        print('Doing aliases')
        # The "--" SQL comments run to the end of their line, so each
        # commented clause has to stay on a line of its own.
        for row in self.db.execute('''select a.*
                                      from aliases a
                                      left join places p
                                          on a.id = p.id
                                          and p.place_type in ("State", "County")
                                          and a.language != p.language
                                      where name_type != "S" -- no colloquial aliases like "The Big Apple"
                                      and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
                                      and p.id is NULL -- exclude foreign-language states/county names
                                      order by id, language,
                                      case name_type
                                          when "P" then 1
                                          when "Q" then 2
                                          when "V" then 3
                                          when "A" then 4
                                          when "S" then 5
                                          else 6
                                      end'''):
            place = self.places.get(row[0])
            if not place:
                continue

            self.aliases[row[0]].append(row[1:])

        print('Doing variant aliases')
        variant_aliases = 0
        for i, row in enumerate(self.db.execute('''select a.*, p.name, p.country_code
                                                   from aliases a
                                                   join places p using(id)
                                                   where a.name_type = "V"
                                                   and a.language = p.language''')):
            place_name, country_code = row[-2:]
            country = country_code.lower()

            row = row[:-2]
            place_id, alias, name_type, language = row

            language = self.language_codes[language]
            if language != 'unk':
                alias_sans_affixes = name_affixes.replace_affixes(alias, language, country=country)
                if alias_sans_affixes:
                    alias = alias_sans_affixes

                place_name_sans_affixes = name_affixes.replace_affixes(place_name, language, country=country)
                if place_name_sans_affixes:
                    place_name = place_name_sans_affixes
            else:
                language = None

            # Only keep a variant if it is equivalent to the place's own
            # name; the stored tuple is the unmodified alias row.
            if equivalent(place_name, alias, toponym_abbreviations_gazetteer, language):
                self.aliases[row[0]].append(row[1:])
                variant_aliases += 1

            if i % 10000 == 0 and i > 0:
                print('tested {} variant aliases with {} positives'.format(i, variant_aliases))

        self.aliases = dict(self.aliases)

        self.formatter = AddressFormatter()

    def get_place_hierarchy(self, place_id):
        """Return the chain of places from ``place_id`` up through its parents.

        :param place_id: id of the starting place
        :return: list of ``(place_id,) + places_row`` tuples, starting
            place first
        :raises KeyError: if a place id in the chain is not in ``self.places``

        The walk stops when the parent id is 1 or when it reaches a place
        that was already visited, so a cycle in the parent links cannot
        loop forever.
        """
        all_places = []
        seen = set()

        while True:
            place = self.places[place_id]
            all_places.append((place_id,) + place)
            seen.add(place_id)

            # The last column of a places row is the parent id
            place_id = place[-1]
            if place_id == 1 or place_id in seen:
                break

        return all_places

    def get_aliases(self, place_id):
        """Return the alias tuples stored for ``place_id`` (empty list if none)."""
        return self.aliases.get(place_id, [])

    def cleanup_name(self, name):
        """Strip leading/trailing spaces, commas and hyphens from ``name``."""
        return name.strip(' ,-')

    def format_postal_codes(self, tag_components=True):
        """Yield ``(language, country, formatted_address)`` for every postal code.

        :param tag_components: passed through to
            ``AddressFormatter.format_address``

        For each postal code the containing places are collected per
        language (``None`` holds the names in the places' own language).
        Every combination of one name per place type is formatted together
        with the postcode, followed by a second version after component
        dropout when dropout actually removed something.
        """
        english = self.language_codes['ENG']

        all_postal_codes = self.db.execute('select * from postal_codes')
        for postal_code_id, country, postal_code, language, place_type, parent_id in all_postal_codes:
            country = country.lower()
            postcode_language = language

            language = self.language_codes[language]

            # Very short codes are only kept if the country's postcode
            # regex matches them in full
            if len(postal_code) <= 3:
                postcode_regex = postcode_regexes.get(country)
                match = postcode_regex.match(postal_code) if postcode_regex else None
                if not match or match.end() != len(postal_code):
                    continue

            # If the county/state is coterminous with a city and contains only one place,
            # set the parent_id to the city instead
            if parent_id in self.coterminous_admins:
                parent_id = self.coterminous_admins[parent_id]

            place_hierarchy = self.get_place_hierarchy(parent_id)

            containing_places = defaultdict(set)

            language_places = {None: containing_places}

            original_language = language

            have_default_language = False

            place_types_seen = set()

            for place_id, place_country, name, lang, place_type, parent in place_hierarchy:
                # The country of the last place in the hierarchy is the
                # one used when formatting below
                country = place_country.lower()

                # First language
                # NOTE(review): original_language was captured before this
                # loop, so this assignment does not affect the language
                # used for the default names below -- confirm intent.
                if not have_default_language and lang != postcode_language:
                    language = self.language_codes[lang]
                    have_default_language = True

                place_type = self.place_types[place_type]
                if AddressFormatter.CITY not in place_types_seen and place_id in self.admins_with_ambiguous_city:
                    continue

                name = self.cleanup_name(name)
                containing_places[place_type].add(name)

                for alias_name, name_type, alias_lang in self.get_aliases(place_id):
                    if not alias_lang:
                        alias_lang = 'UNK'
                    if alias_lang == lang and lang != 'UNK':
                        # Same language as the place itself: file under the defaults
                        alias_language = None
                    else:
                        alias_language = self.language_codes[alias_lang]

                    lang_places = language_places.setdefault(alias_language, defaultdict(set))
                    lang_places[place_type].add(self.cleanup_name(alias_name))

                place_types_seen.add(place_type)

            default_city_names = {
                name.lower()
                for name in language_places.get(None, {}).get(AddressFormatter.CITY, [])
            }

            for language, containing_places in six.iteritems(language_places):
                is_default_language = language is None
                if is_default_language:
                    language = original_language

                country_localized_name = country_names.localized_name(country, language)
                if country_localized_name:
                    containing_places[AddressFormatter.COUNTRY].add(country_localized_name)

                # Alpha-3 country code for the default-language names and
                # for English. The check has to use the mapped language
                # code: by this point ``language`` is never None and never
                # the raw GeoPlanet code "ENG".
                country_alpha3_code = country_names.alpha3_code(country)
                if country_alpha3_code and (is_default_language or language == english):
                    containing_places[AddressFormatter.COUNTRY].add(country_alpha3_code)

                keys = list(containing_places)
                all_values = [containing_places[k] for k in keys]

                keys_set = set(keys)

                for values in itertools.product(*all_values):
                    components = {
                        AddressFormatter.POSTCODE: postal_code
                    }

                    if not default_city_names:
                        components.update(zip(keys, values))
                    else:
                        for k, v in zip(keys, values):
                            if k == AddressFormatter.CITY or AddressFormatter.CITY in keys_set or v.lower() not in default_city_names:
                                components[k] = v

                    format_language = language if self.formatter.template_language_matters(country, language) else None
                    formatted = self.formatter.format_address(components, country, language=format_language,
                                                              minimal_only=False, tag_components=tag_components)

                    yield (language, country, formatted)

                    component_keys = set(components)
                    components = place_config.dropout_components(components, (), country=country, population=0)

                    # Only emit the dropout version if it differs from the full one
                    if len(components) > 1 and set(components) ^ component_keys:
                        formatted = self.formatter.format_address(components, country, language=format_language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self, out_dir, tag_components=True):
        """Write all formatted postal codes as TSV in ``out_dir``.

        :param out_dir: directory in which the output file is created
        :param tag_components: if true, rows are
            ``(language, country, formatted_address)`` and go to the tagged
            file; otherwise rows contain only the formatted address
        """
        if tag_components:
            filename = GEOPLANET_FORMAT_DATA_TAGGED_FILENAME
        else:
            filename = GEOPLANET_FORMAT_DATA_FILENAME

        i = 0
        with open(os.path.join(out_dir, filename), 'w') as formatted_file:
            writer = csv.writer(formatted_file, 'tsv_no_quote')

            for language, country, formatted_address in self.format_postal_codes(tag_components=tag_components):
                if not formatted_address or not formatted_address.strip():
                    continue

                formatted_address = tsv_string(formatted_address)
                if not formatted_address or not formatted_address.strip():
                    continue

                if tag_components:
                    row = (language, country, formatted_address)
                else:
                    row = (formatted_address,)

                writer.writerow(row)
                i += 1
                if i % 1000 == 0 and i > 0:
                    print('did {} formatted addresses'.format(i))
class OpenAddressesUKFormatter(object):
    '''
    Generates formatted address training examples from a UK address CSV file.

    Each usable row yields one formatted address and, at random, up to two
    reduced variants: one with places and postcode dropped, and one with the
    street-level components (and sometimes the postcode) dropped.
    '''

    # CSV header name -> address formatter component; other columns are ignored
    field_map = {
        'pao': AddressFormatter.HOUSE_NUMBER,
        'street.name': AddressFormatter.ROAD,
        'town.name': AddressFormatter.CITY,
        'postcode.name': AddressFormatter.POSTCODE
    }

    def __init__(self):
        self.formatter = AddressFormatter()

    # Component -> predicate; a value rejected by its validator is discarded
    component_validators = {
        AddressFormatter.HOUSE_NUMBER: OpenAddressesFormatter.validators.validate_house_number,
        AddressFormatter.ROAD: OpenAddressesFormatter.validators.validate_street,
        AddressFormatter.POSTCODE: OpenAddressesFormatter.validators.validate_postcode,
    }

    # Not referenced by the methods of this class
    cldr_country_probability = 0.3
    # Chance of additionally emitting a version without places and postcode
    address_only_probability = 0.4
    # Chance of additionally emitting a version without street-level components
    drop_address_probability = 0.6
    # Chance that the version above also loses its postcode
    drop_address_and_postcode_probability = 0.1

    @classmethod
    def cleanup_number(cls, num, strip_commas=False):
        '''
        Normalize a numeric string.

        Strings that already parse as integers are returned stripped (and
        without commas if strip_commas). Strings that only parse as floats
        are truncated to their integer part, with any leading zeros of the
        input re-attached. Anything else, including inf/nan, is returned
        as stripped.
        '''
        num = num.strip()
        if strip_commas:
            num = num.replace(six.u(','), six.u(''))

        try:
            int(num)
        except (ValueError, TypeError):
            try:
                num_float = float(num)
                leading_zeros = len(num) - len(num.lstrip(six.u('0')))
                # int(float('inf')) raises OverflowError, int(float('nan'))
                # raises ValueError; both leave num untouched
                num = safe_decode(int(num_float))
                if leading_zeros:
                    num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
            except (ValueError, TypeError, OverflowError):
                pass
        return num

    def fix_component_encodings(self, components):
        '''Return a copy of components with every value decoded and passed through ftfy.fix_encoding.'''
        return {
            k: ftfy.fix_encoding(safe_decode(v))
            for k, v in six.iteritems(components)
        }

    def _row_components(self, row, header_indices):
        '''Extract the non-empty, validated component values from one CSV row.'''
        components = {}
        for i, key in six.iteritems(header_indices):
            value = row[i].strip()
            if not value:
                continue

            if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                continue

            value = value.strip(', -')

            validator = self.component_validators.get(key, None)
            if validator is not None and not validator(value):
                continue

            if value:
                components[key] = value
        return components

    def formatted_addresses(self, path, tag_components=True):
        '''
        Generate formatted addresses from the CSV file at path.

        The first row is read as the header; only columns named in field_map
        are used. Yields (language, country, formatted) tuples. An empty
        file yields nothing.
        '''
        country = Countries.UNITED_KINGDOM
        candidate_languages = get_country_languages(country).items()

        # Keep the file open only for the life of the generator
        with open(path) as f:
            reader = unicode_csv_reader(f)

            # next() works on both Python 2 and 3 iterators; the default
            # avoids StopIteration escaping the generator on an empty file
            headers = next(reader, None)
            if headers is None:
                return

            header_indices = {
                i: self.field_map[k]
                for i, k in enumerate(headers) if k in self.field_map
            }

            for row in reader:
                components = self._row_components(row, header_indices)
                if not components:
                    continue

                components = self.fix_component_encodings(components)
                language = AddressComponents.address_language(components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)
                    if AddressComponents.street_name_is_valid(street):
                        street = abbreviate(street_types_gazetteer, street, language)
                        components[AddressFormatter.ROAD] = street
                    else:
                        components.pop(AddressFormatter.ROAD)
                        street = None

                house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                if house_number:
                    house_number = self.cleanup_number(house_number, strip_commas=True)
                    if house_number is not None:
                        components[AddressFormatter.HOUSE_NUMBER] = house_number

                postcode = components.get(AddressFormatter.POSTCODE, None)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if not street or (street and house_number and (street.lower() == house_number.lower())):
                    if not postcode:
                        continue
                    components = AddressComponents.drop_address(components)

                country_name = AddressComponents.cldr_country_name(country, language)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer, component, language)
                        component = AddressComponents.name_hyphens(component)
                        components[component_key] = component

                AddressComponents.replace_names(components)
                AddressComponents.prune_duplicate_names(components)
                AddressComponents.remove_numeric_boundary_names(components)
                AddressComponents.add_house_number_phrase(components, language, country=country)

                # Component dropout
                components = place_config.dropout_components(components, country=country)

                formatted = self.formatter.format_address(components, country, language=language,
                                                          minimal_only=False, tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < self.address_only_probability and street:
                    address_only_components = AddressComponents.drop_places(components)
                    address_only_components = AddressComponents.drop_postcode(address_only_components)
                    formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                # One draw decides both drops: the postcode can only be
                # dropped when the street-level components are dropped too
                rand_val = random.random()

                if street and house_number and rand_val < self.drop_address_probability:
                    components = AddressComponents.drop_address(components)

                    if rand_val < self.drop_address_and_postcode_probability:
                        components = AddressComponents.drop_postcode(components)

                    if components and (len(components) > 1):
                        formatted = self.formatter.format_address(components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self, infile, out_dir, tag_components=True):
        '''
        Write the addresses generated from infile to a TSV file in out_dir.

        With tag_components, rows are (language, country, formatted_address)
        and go to OPENADDRESSES_UK_FORMAT_DATA_TAGGED_FILENAME; otherwise
        rows hold only the formatted address and go to
        OPENADDRESSES_UK_FORMAT_DATA_FILENAME. Progress is printed every
        1000 rows.
        '''
        if tag_components:
            filename = OPENADDRESSES_UK_FORMAT_DATA_TAGGED_FILENAME
        else:
            filename = OPENADDRESSES_UK_FORMAT_DATA_FILENAME

        num_written = 0

        # The context manager ensures the output is flushed and closed even
        # if formatting raises part-way through
        with open(os.path.join(out_dir, filename), 'w') as out_file:
            writer = csv.writer(out_file, 'tsv_no_quote')

            for language, country, formatted_address in self.formatted_addresses(
                    infile, tag_components=tag_components):
                # The formatter can produce nothing for a set of components;
                # an empty row is useless as training data
                if not formatted_address or not formatted_address.strip():
                    continue

                if tag_components:
                    row = (language, country, formatted_address)
                else:
                    row = (formatted_address, )

                writer.writerow(row)
                num_written += 1
                if num_written % 1000 == 0:
                    print('did {} formatted addresses'.format(num_written))