コード例 #1
0
ファイル: formatter.py プロジェクト: rinigus/deb-libpostal
    def __init__(self, components, country_rtree, debug=False):
        '''
        Keep references to the collaborators and create the address formatter.

        :param components: stored unchanged as ``self.components``
        :param country_rtree: stored unchanged as ``self.country_rtree``
        :param debug: stored unchanged as ``self.debug`` (defaults to False)
        '''
        self.components, self.country_rtree = components, country_rtree
        self.debug = debug
        # A fresh AddressFormatter is created for every instance
        self.formatter = AddressFormatter()
コード例 #2
0
ファイル: formatter.py プロジェクト: BERENZ/libpostal
    def __init__(self, components, country_rtree, debug=False):
        '''
        Keep references to the collaborators and create the address formatter.

        :param components: stored unchanged as ``self.components``
        :param country_rtree: stored unchanged as ``self.country_rtree``
        :param debug: stored unchanged as ``self.debug`` (defaults to False)
        '''
        self.components, self.country_rtree = components, country_rtree
        self.debug = debug
        # A fresh AddressFormatter is created for every instance
        self.formatter = AddressFormatter()
コード例 #3
0
def build_address_format_training_data_limited(language_rtree, infile, out_dir):
    '''
    Creates a special kind of formatted address training data from OSM's addr:* tags
    that is designed for use in language classification. These records are similar
    to the untagged formatted records but include the language and country
    (suitable for concatenation with the rest of the language training data),
    and remove several fields like country which usually do not contain helpful
    information for classifying the language.

    Example:

    nb      no      Olaf Ryes Plass 8 | Oslo

    :param language_rtree: passed through to get_language_names
    :param infile: OSM input handed to parse_osm
    :param out_dir: directory in which ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME is written
    '''
    i = 0

    formatter = AddressFormatter()
    remove_keys = NAME_KEYS + COUNTRY_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS
    out_path = os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME)

    # The context manager guarantees the output is flushed and closed even
    # if parsing or formatting raises part-way through the input.
    with open(out_path, 'w') as f:
        writer = csv.writer(f, 'tsv_no_quote')

        for key, value, deps in parse_osm(infile):
            try:
                # Only used as a validity check: records without parseable
                # coordinates are skipped.
                latlon_to_decimal(value['lat'], value['lon'])
            except Exception:
                continue

            for k in remove_keys:
                value.pop(k, None)

            if not value:
                continue

            country, name_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street')
            if not name_language:
                continue

            single_language = len(name_language) == 1
            for lang, val in name_language.items():
                if lang not in languages:
                    continue

                address_dict = value.copy()
                # Iterate over a snapshot of the keys because entries are removed
                # inside the loop. Language-specific values are looked up in the
                # untouched ``value`` dict so that the outcome does not depend on
                # whether a "key:lang" entry was already popped from the copy.
                for k in list(address_dict):
                    namespaced_key = u'{}:{}'.format(k, lang)
                    if namespaced_key in value:
                        address_dict[k] = value[namespaced_key]
                    elif not single_language:
                        address_dict.pop(k)

                if not address_dict:
                    continue

                formatted_address_untagged = formatter.format_address(country, address_dict, tag_components=False)
                if formatted_address_untagged is not None:
                    formatted_address_untagged = tsv_string(formatted_address_untagged)

                    writer.writerow((lang, country, formatted_address_untagged))

            i += 1
            if i % 1000 == 0:
                print('did {} formatted addresses'.format(i))
コード例 #4
0
def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, infile, out_dir, tag_components=True):
    '''
    Creates formatted address training data for supervised sequence labeling (or potentially
    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.

    Example:

    cs  cz  Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country

    The field structure is similar to other training data created by this script i.e.
    {language, country, data}. The data field here is a sequence of labeled tokens similar
    to what we might see in part-of-speech tagging.

    This format uses a special character "|" to denote possible breaks in the input (comma, newline).
    This information can potentially be used downstream by the sequence model as these
    breaks may be present at prediction time.

    Note that for the address parser, we'd like it to be robust to many different types
    of input, so we may selectively eliminate components.

    Example:

    sr      rs      Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic

    This may be useful in learning word representations, statistical phrases, morphology
    or other models requiring only the sequence of words.

    :param admin_rtree: index used to reverse-geocode OSM admin boundaries
    :param language_rtree: index passed to country_and_languages
    :param neighborhoods_rtree: index queried with point_in_poly for neighborhoods
    :param infile: OSM input handed to parse_osm
    :param out_dir: directory in which the output file is created
    :param tag_components: if True, write (language, country, tagged address) rows to
        ADDRESS_FORMAT_DATA_TAGGED_FILENAME, including variants with randomly
        dropped components; otherwise write one-column rows holding only the
        formatted address to ADDRESS_FORMAT_DATA_FILENAME
    '''
    i = 0

    formatter = AddressFormatter()
    osm_address_components.configure()

    if tag_components:
        out_filename = ADDRESS_FORMAT_DATA_TAGGED_FILENAME
    else:
        out_filename = ADDRESS_FORMAT_DATA_FILENAME

    remove_keys = OSM_IGNORE_KEYS

    # The context manager guarantees the output is flushed and closed even
    # if parsing or formatting raises part-way through the input.
    with open(os.path.join(out_dir, out_filename), 'w') as out_file:
        writer = csv.writer(out_file, 'tsv_no_quote')

        for node_id, value, deps in parse_osm(infile):
            try:
                latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
            except Exception:
                # Without usable coordinates nothing below can be looked up
                continue

            country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
            if not (country and candidate_languages):
                continue

            for key in remove_keys:
                value.pop(key, None)

            language = None
            if tag_components:
                if len(candidate_languages) == 1:
                    language = candidate_languages[0]['lang']
                else:
                    street = value.get('addr:street', None)
                    if street is not None:
                        language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
                    else:
                        language = UNKNOWN_LANGUAGE

            address_components = {k: v for k, v in value.items() if k in formatter.aliases}
            formatter.replace_aliases(address_components)

            address_country = address_components.get(AddressFormatter.COUNTRY)

            # Country names
            # -------------
            #
            # In OSM, addr:country is almost always an ISO-3166 alpha-2 country code.
            # However, we'd like to expand these to include natural language forms
            # of the country names we might be likely to encounter in a geocoder or
            # handwritten address.
            #
            # These splits are somewhat arbitrary but could potentially be fit to data
            # from OpenVenues or other sources on the usage of country name forms.
            #
            # If the address includes a country, the selection procedure proceeds as follows:
            #
            # 1. With probability a, select the country name in the language of the address
            #    (determined above), or with the localized country name if the language is
            #    undetermined or ambiguous.
            #
            # 2. With probability b(1-a), sample a language from the distribution of
            #    languages on the Internet and use the country's name in that language.
            #
            # 3. This is implicit, but with probability (1-b)(1-a), keep the country code

            non_local_language = None

            if random.random() < 0.3:
                # 30% of the time: add Quattroshapes country
                address_country = country.upper()

            r = random.random()

            # 1. 60% of the time: use the country name in the current language or the country's local language
            if address_country and r < 0.6:
                localized = None
                if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
                    localized = language_country_names.get(language, {}).get(address_country.upper())

                if not localized:
                    localized = country_localized_display_name(address_country.lower())

                if localized:
                    address_components[AddressFormatter.COUNTRY] = localized
            # 2. 10% of the time: country's name in a language sampled from the distribution of languages on the Internet
            elif address_country and r < 0.7:
                non_local_language = sample_random_language()
                lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper())
                if lang_country:
                    address_components[AddressFormatter.COUNTRY] = lang_country
            # 3. Implicit: the rest of the time keep the country code

            # States
            # ------
            #
            # Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name
            # whereas we'd like to include both forms, so with some probability, replace the abbreviated
            # name with the unabbreviated one e.g. CA => California
            address_state = address_components.get(AddressFormatter.STATE)

            if address_state:
                state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language)

                if state_full_name and random.random() < 0.3:
                    address_components[AddressFormatter.STATE] = state_full_name

            # OSM boundaries
            # --------------
            #
            # For many addresses, the city, district, region, etc. are all implicitly
            # generated by the reverse geocoder e.g. we do not need an addr:city tag
            # to identify that 40.74, -74.00 is in New York City as well as its parent
            # geographies (New York county, New York state, etc.)
            #
            # Where possible we augment the addr:* tags with some of the reverse-geocoded
            # relations from OSM.
            #
            # Since addresses found on the web may have the same properties, we
            # include these qualifiers in the training data.

            osm_components = osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude)
            if osm_components:
                if non_local_language is not None:
                    suffix = ':{}'.format(non_local_language)
                else:
                    suffix = ''

                name_key = ''.join(('name', suffix))
                raw_name_key = 'name'
                short_name_key = ''.join(('short_name', suffix))
                raw_short_name_key = 'short_name'
                simple_name_key = 'name:simple'
                international_name_key = 'int_name'
                alt_name_key = ''.join(('alt_name', suffix))
                raw_alt_name_key = 'alt_name'
                official_name_key = ''.join(('official_name', suffix))
                raw_official_name_key = 'official_name'
                iso_code_key = 'ISO3166-1:alpha2'
                iso_code3_key = 'ISO3166-1:alpha3'

                poly_components = defaultdict(list)

                for component, components_values in osm_components.items():
                    seen = set()

                    # Choose which name to use with given probabilities
                    r = random.random()
                    if r < 0.7:
                        # 70% of the time use the name tag
                        key = name_key
                        raw_key = raw_name_key
                    elif r < 0.8:
                        # 10% of the time use the short name
                        key = short_name_key
                        raw_key = raw_short_name_key
                    elif r < 0.9:
                        # 10% of the time use the official name
                        key = official_name_key
                        raw_key = raw_official_name_key
                    else:
                        # 10% of the time use the alt name
                        key = alt_name_key
                        raw_key = raw_alt_name_key

                    for component_value in components_values:
                        r = random.random()
                        name = None

                        if iso_code3_key in component_value and r < 0.1:
                            name = component_value[iso_code3_key]
                        elif iso_code_key in component_value and r < 0.3:
                            name = component_value[iso_code_key]
                        elif language == 'en' and not non_local_language and r < 0.7:
                            # Particularly to address the US (prefer United States,
                            # not United States of America) but may capture variations
                            # in other English-speaking countries as well.
                            if simple_name_key in component_value:
                                name = component_value[simple_name_key]
                            elif international_name_key in component_value:
                                name = component_value[international_name_key]

                        # Fall back to the chosen key (suffixed, then raw) ...
                        if not name:
                            name = component_value.get(key, component_value.get(raw_key))

                        # ... and finally to the plain name tag
                        if not name:
                            name = component_value.get(name_key, component_value.get(raw_name_key))

                        if not name:
                            continue

                        if (component, name) not in seen:
                            poly_components[component].append(name)
                            seen.add((component, name))

                for component, vals in poly_components.items():
                    # Explicit addr:* values win unless a non-local language was sampled
                    if component not in address_components or non_local_language:
                        val = u', '.join(vals)
                        if component == AddressFormatter.STATE and random.random() < 0.7:
                            val = STATE_EXPANSIONS.get(address_country, {}).get(val, val)
                        address_components[component] = val

            # Neighborhoods
            # -------------
            #
            # In some cities, neighborhoods may be included in a free-text address.
            #
            # OSM includes many neighborhoods but only as points, rather than the polygons
            # needed to perform reverse-geocoding. We use a hybrid index containing
            # Quattroshapes/Zetashapes polygons matched fuzzily with OSM names (which are
            # on the whole of better quality).

            neighborhood = neighborhoods_rtree.point_in_poly(latitude, longitude)
            if neighborhood and AddressFormatter.SUBURB not in address_components:
                address_components[AddressFormatter.SUBURB] = neighborhood['name']

            # Version with all components
            formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)

            if tag_components:
                formatted_addresses = [formatted_address]

                address_components = {k: v for k, v in address_components.items() if k in OSM_ADDRESS_COMPONENT_VALUES}
                if not address_components:
                    continue

                current_components = component_bitset(address_components.keys())

                # Randomly drop components (50% each) as long as the remaining set
                # is a valid combination, emitting one extra variant per drop.
                # A snapshot of the keys is iterated because the dict shrinks in the loop.
                for component in list(address_components.keys()):
                    if (current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component]) in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
                        address_components.pop(component)
                        current_components ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                        if not address_components:
                            break

                        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
                        formatted_addresses.append(formatted_address)

                for formatted_address in formatted_addresses:
                    if formatted_address and formatted_address.strip():
                        formatted_address = tsv_string(formatted_address)
                        if not formatted_address or not formatted_address.strip():
                            continue
                        row = (language, country, formatted_address)

                        writer.writerow(row)
            elif formatted_address and formatted_address.strip():
                formatted_address = tsv_string(formatted_address)
                writer.writerow([formatted_address])

            i += 1
            if i % 1000 == 0:
                print('did {} formatted addresses'.format(i))
コード例 #5
0
ファイル: formatter.py プロジェクト: rinigus/deb-libpostal
class OpenAddressesFormatter(object):
    # Regex clean-ups as (compiled pattern, replacement) pairs, keyed by
    # address component. The ``None`` key holds replacements for all fields.
    # Patterns are raw strings so that regex escapes such as \s and \- are not
    # treated as (invalid) Python string escapes.
    field_regex_replacements = {
        # All fields
        None: [
            (re.compile(r'<\s*null\s*>', re.I), u''),
            (re.compile(r'[\s]{2,}'), six.u(' ')),
            (re.compile(r'\`'), u"'"),
            (re.compile(r'\-?\*'), u""),
        ],
        AddressFormatter.HOUSE_NUMBER: [
            # Most of the house numbers in Montreal start with "#"
            (re.compile(r'^#', re.UNICODE), u''),
            # Some house numbers have multiple hyphens
            (re.compile(r'[\-]{2,}'), u'-'),
            # Some house number ranges are split up like "12 -14"
            (re.compile(r'[\s]*\-[\s]*'), u'-'),
        ]
    }

    # Maps language -> compiled regex built from that language's
    # 'unit_types_numbered' phrases. Populated by the loop below while the
    # class body executes.
    unit_type_regexes = {}

    # NOTE: this loop runs at class-definition time, so its loop variables
    # (lang, dictionary_type, values, unit_phrases, pattern) remain bound as
    # class attributes afterwards.
    for (lang, dictionary_type), values in six.iteritems(
            address_phrase_dictionaries.phrases):
        if dictionary_type == 'unit_types_numbered':
            # Flatten the phrase groups, skipping phrases of 2 characters or fewer
            unit_phrases = [
                safe_encode(p) for p in itertools.chain(*values) if len(p) > 2
            ]
            # Matches, at the end of the string: one of the unit phrases,
            # whitespace, an optional "#", then a designator that is a number,
            # a single letter, or a letter/number pair with an optional hyphen
            # (case-insensitive). The phrases are inserted into the pattern
            # as-is, without regex escaping.
            pattern = re.compile(
                r'\b(?:{})\s+(?:#?\s*)(?:[\d]+|[a-z]|[a-z]\-?[\d]+|[\d]+\-?[a-z])\s*$'
                .format(safe_encode('|').join(unit_phrases)),
                re.I | re.UNICODE)
            unit_type_regexes[lang] = pattern

    def __init__(self, components, country_rtree, debug=False):
        '''
        Keep references to the collaborators and create the address formatter.

        :param components: stored unchanged as ``self.components``
        :param country_rtree: stored unchanged as ``self.country_rtree``
        :param debug: stored unchanged as ``self.debug`` (defaults to False)
        '''
        self.components, self.country_rtree = components, country_rtree
        self.debug = debug
        # A fresh AddressFormatter is created for every instance
        self.formatter = AddressFormatter()

    class validators:
        '''
        Predicates that decide whether a field value is usable. Each one
        returns a truthy value for acceptable input and a falsy one otherwise.
        '''

        @classmethod
        def validate_postcode(cls, postcode):
            '''
            Reject postcodes made up only of zeros and separator characters
            ("0", "-", ".", " ", ","); those are improperly-formatted NULL
            values. An empty postcode is rejected as well.
            '''
            placeholder_chars = ('0', '-', '.', ' ', ',')
            return any(c not in placeholder_chars for c in postcode)

        @classmethod
        def validate_street(cls, street):
            '''
            A street that is just a number is most likely a copy/paste error
            (the value belongs in the house number field), so reject it.
            '''
            if is_numeric(street):
                return False
            return True

        @classmethod
        def validate_house_number(cls, house_number):
            '''
            House numbers need not be purely numeric, but some OpenAddresses
            data sets put the capitalized street name in the house number
            field, so only number-like shapes are accepted, at the cost of
            maybe missing a few houses numbered "A", etc.

            OpenAddresses primarily comes from county GIS servers, etc. which
            use a variety of database schemas and don't always handle NULLs
            very well, so zero values are treated as errors:

            - a value that parses as an integer is valid only if it is > 0
            - any other value must be non-empty, match one of the known
              number patterns, and contain at least one non-zero digit
            '''
            try:
                as_int = int(house_number.strip())
            except (ValueError, TypeError):
                pass
            else:
                return as_int > 0

            candidate = house_number.strip()
            if not candidate:
                return candidate

            recognized = (is_numeric(candidate)
                          or fraction_regex.match(candidate)
                          or number_space_letter_regex.match(candidate)
                          or number_slash_number_regex.match(candidate)
                          or number_fraction_regex.match(candidate))
            if not recognized:
                return recognized

            # Reject values whose digits are all zeros
            return not all(c == '0' for c in candidate if c.isdigit())

        @classmethod
        def validate_house_number_sin_numero(cls, house_number):
            '''
            Accept anything matching sin_numero_regex, otherwise apply the
            generic house number validation.
            '''
            return True if sin_numero_regex.match(house_number) else cls.validate_house_number(house_number)

        @classmethod
        def validate_russian_house_number(cls, house_number):
            '''
            Accept values matching any of the Russian-specific patterns,
            otherwise apply the generic house number validation.
            '''
            russian_patterns = (dom_korpus_stroyeniye_regex,
                                uchastok_regex,
                                bea_nomera_regex)
            for russian_pattern in russian_patterns:
                if russian_pattern.match(house_number):
                    return True
            return cls.validate_house_number(house_number)

        @classmethod
        def validate_colombian_house_number(cls, house_number):
            '''
            Every value is accepted.
            '''
            return True

        @classmethod
        def validate_chinese_house_number(cls, house_number):
            '''
            Reject empty values. Accept values whose tokens are all either
            numeric token types or one of the characters 号, 栋, 附; anything
            else goes through the generic house number validation.
            '''
            if not house_number:
                return False

            for t, c in tokenize(house_number):
                if not (c in token_types.NUMERIC_TOKEN_TYPES or t in (u'号', u'栋',
                                                                      u'附')):
                    break
            else:
                return True

            return cls.validate_house_number(house_number)

    # Default validator per address component
    component_validators = {
        AddressFormatter.POSTCODE: validators.validate_postcode,
        AddressFormatter.ROAD: validators.validate_street,
        AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
    }

    # Per-language validators, keyed by language and then by component
    language_validators = {
        SPANISH: {AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero},
        PORTUGUESE: {AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero},
        RUSSIAN: {AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number},
        CHINESE: {AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number},
    }

    # Per-country validators, keyed by country and then by component
    country_validators = {
        Countries.COLOMBIA: {AddressFormatter.HOUSE_NUMBER: validators.validate_colombian_house_number},
    }

    # A run of digits that is not followed by another digit, 号 or 栋.
    # Backslashes are doubled because this is a non-raw unicode literal
    # (a bare "\d" is an invalid string escape, and the ur'' prefix does not
    # exist on Python 3).
    chinese_annex_regex = re.compile(u'([\\d]+)(?![\\d号栋])', re.U)

    @classmethod
    def format_chinese_house_number(cls, house_number):
        if not house_number:
            return house_number
        return cls.chinese_annex_regex.sub(u'\\1号', house_number)

    @classmethod
    def format_colombian_house_number(cls, house_number):
        '''
        Randomly vary the formatting of a Colombian house number.

        The value is stripped first. If it matches
        ``colombian_standard_house_number_regex``, its two captured parts
        (cross street and building number) are each randomly left as-is or
        have their spaces removed, then joined with a randomly chosen
        separator ("-", " - " or " "). Half of the time a randomly chosen
        prefix ("#", "no.", "no", "nº") is prepended unless the result already
        starts with one. Non-matching values are returned stripped.
        '''
        house_number = house_number.strip()
        match = colombian_standard_house_number_regex.match(house_number)
        if not match:
            return house_number

        # The separator is drawn before anything else to keep the sequence of
        # random draws stable
        separator = random.choice((u'-', u' - ', u' '))
        cross_street, building_number = match.groups()

        parts = []
        for part in (cross_street, building_number):
            if part and u' ' in part and random.choice((True, False)):
                part = part.replace(u' ', u'')
            if part:
                parts.append(part)

        if not parts:
            return house_number

        house_number = separator.join(parts)
        prefixes = (u'#', u'no.', u'no', u'nº')
        if random.choice((True, False)) and not house_number.lower().startswith(prefixes):
            house_number = u' '.join([random.choice(prefixes), house_number])

        return house_number

    def get_property(self, key, *configs):
        '''
        Look ``key`` up in each config in order and return the first value
        that is not None, or None if no config provides one.
        '''
        candidates = (config.get(key, None) for config in configs)
        return next((found for found in candidates if found is not None), None)

    def cldr_country_name(self, country_code, language, configs):
        '''
        Randomly pick a representation of the country for ``country_code``.

        With probability ``cldr_country_probability`` (read from ``configs``)
        one of four forms is chosen using the configured weights: the
        localized name, the ISO-3166 name, the alpha-2 code or the alpha-3
        code. The upper-cased ``country_code`` is the alpha-2 form and also
        the fallback for the localized and alpha-3 forms. Otherwise None is
        returned.
        '''
        use_cldr_prob = float(
            self.get_property('cldr_country_probability', *configs))

        if not random.random() < use_cldr_prob:
            return None

        values = range(4)
        localized, iso_3166, alpha2, alpha3 = values

        # Same order as ``values``: localized, ISO-3166 name, alpha-2, alpha-3
        weight_keys = (
            'localized_name_probability',
            'iso_3166_name_probability',
            'iso_alpha_2_code_probability',
            'iso_alpha_3_code_probability',
        )
        probs = cdf(
            [float(self.get_property(k, *configs)) for k in weight_keys])

        country_type = weighted_choice(values, probs)

        code_upper = country_code.upper()
        if country_type == localized:
            return (country_names.localized_name(country_code, language)
                    or country_names.localized_name(country_code)
                    or code_upper)
        if country_type == iso_3166:
            return country_names.iso3166_name(country_code)
        if country_type == alpha3:
            return country_names.alpha3_code(country_code) or code_upper
        # alpha-2: the upper-cased code itself
        return code_upper

    @classmethod
    def cleanup_number(cls, num, strip_commas=False):
        num = num.strip()
        if strip_commas:
            num = num.replace(six.u(','), six.u(''))
        try:
            num_int = int(num)
        except (ValueError, TypeError):
            try:
                num_float = float(num)
                leading_zeros = 0
                for c in num:
                    if c == six.u('0'):
                        leading_zeros += 1
                    else:
                        break
                num = safe_decode(int(num_float))
                if leading_zeros:
                    num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
            except (ValueError, TypeError):
                pass
        return num

    @classmethod
    def fix_component_encodings(cls, components):
        '''
        Return a new dict with the same keys in which every value has been
        passed through ``safe_decode`` and then ``ftfy.fix_encoding``.
        '''
        fixed = {}
        for name, raw_value in six.iteritems(components):
            fixed[name] = ftfy.fix_encoding(safe_decode(raw_value))
        return fixed

    def formatted_addresses(self,
                            country_dir,
                            path,
                            configs,
                            tag_components=True):
        abbreviate_street_prob = float(
            self.get_property('abbreviate_street_probability', *configs))
        separate_street_prob = float(
            self.get_property('separate_street_probability', *configs) or 0.0)
        abbreviate_unit_prob = float(
            self.get_property('abbreviate_unit_probability', *configs))
        separate_unit_prob = float(
            self.get_property('separate_unit_probability', *configs) or 0.0)
        abbreviate_toponym_prob = float(
            self.get_property('abbreviate_toponym_probability', *configs))

        add_osm_boundaries = bool(
            self.get_property('add_osm_boundaries', *configs) or False)
        add_osm_neighborhoods = bool(
            self.get_property('add_osm_neighborhoods', *configs) or False)
        osm_neighborhood_overrides_city = self.get_property(
            'osm_neighborhood_overrides_city', *configs)
        non_numeric_units = bool(
            self.get_property('non_numeric_units', *configs) or False)
        house_number_strip_commas = bool(
            self.get_property('house_number_strip_commas', *configs) or False)
        numeric_postcodes_only = bool(
            self.get_property('numeric_postcodes_only', *configs) or False)
        postcode_strip_non_digit_chars = bool(
            self.get_property('postcode_strip_non_digit_chars', *configs)
            or False)

        address_only_probability = float(
            self.get_property('address_only_probability', *configs))
        place_only_probability = float(
            self.get_property('place_only_probability', *configs))
        place_and_postcode_probability = float(
            self.get_property('place_and_postcode_probability', *configs))

        city_replacements = self.get_property('city_replacements', *configs)

        override_country_dir = self.get_property('override_country_dir',
                                                 *configs)

        postcode_length = int(
            self.get_property('postcode_length', *configs) or 0)

        drop_address_probability = place_only_probability + place_and_postcode_probability

        ignore_rows_missing_fields = set(
            self.get_property('ignore_rows_missing_fields', *configs) or [])

        ignore_fields_containing = {
            field: re.compile(
                six.u('|').join(
                    [six.u('(?:{})').format(safe_decode(v)) for v in value]),
                re.I | re.UNICODE)
            for field, value in six.iteritems(
                dict(
                    self.get_property('ignore_fields_containing', *configs)
                    or {}))
        }

        alias_fields_containing = {
            field:
            [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
            for field, value in six.iteritems(
                dict(
                    self.get_property('alias_fields_containing', *configs)
                    or {}))
        }

        config_language = self.get_property('language', *configs)

        add_components = self.get_property('add', *configs)

        fields = self.get_property('fields', *configs)
        if not fields:
            return

        field_map = {
            field_name: f['component']
            for field_name, f in six.iteritems(fields)
        }
        mapped_values = {
            f['component']: f['value_map']
            for f in six.itervalues(fields)
            if hasattr(f.get('value_map'), 'get')
        }

        f = open(path)
        reader = unicode_csv_reader(f)
        headers = reader.next()

        header_indices = {
            i: field_map[k]
            for i, k in enumerate(headers) if k in field_map
        }
        latitude_index = headers.index('LAT')
        longitude_index = headers.index('LON')

        # Clear cached polygons
        self.components.osm_admin_rtree.clear_cache()
        self.components.neighborhoods_rtree.clear_cache()

        for row in reader:
            try:
                latitude = float(row[latitude_index])
                longitude = float(row[longitude_index])
            except (ValueError, TypeError):
                continue

            language = config_language

            components = {}

            skip_record = False

            for i, key in six.iteritems(header_indices):
                value = row[i].strip()
                if not value and key in ignore_rows_missing_fields:
                    skip_record = True
                    break
                elif not value:
                    continue

                if key in mapped_values:
                    value = mapped_values[key].get(value, value)

                if key == AddressFormatter.ROAD and language == SPANISH:
                    value = self.components.spanish_street_name(value)

                if key == AddressFormatter.POSTCODE:
                    value = self.cleanup_number(value)

                    if postcode_strip_non_digit_chars:
                        value = six.u('').join(
                            (c for c in value if c.isdigit()))

                    if value and not is_numeric(
                            value) and numeric_postcodes_only:
                        continue
                    else:
                        if postcode_length:
                            value = value.zfill(
                                postcode_length)[:postcode_length]

                if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
                    if add_osm_boundaries:
                        continue
                    value = self.components.cleaned_name(
                        value, first_comma_delimited_phrase=True)
                    if value and ((len(value) < 2
                                   and not get_string_script(value)[0].lower()
                                   in ideographic_scripts)
                                  or is_numeric(value)):
                        continue

                if not_applicable_regex.match(value) or null_regex.match(
                        value) or unknown_regex.match(value):
                    continue

                for exp, sub_val in self.field_regex_replacements.get(key, []):
                    value = exp.sub(sub_val, value)

                for exp, sub_val in self.field_regex_replacements.get(
                        None, []):
                    value = exp.sub(sub_val, value)

                value = value.strip(', -')

                validator = self.country_validators.get(country_dir, {}).get(
                    key,
                    self.language_validators.get(language, {}).get(
                        key, self.component_validators.get(key, None)))

                if validator is not None and not validator(value):
                    continue

                if key in ignore_fields_containing and ignore_fields_containing[
                        key].search(value):
                    continue

                for (pattern, alias) in alias_fields_containing.get(key, []):
                    if pattern.search(value):
                        if 'component' in alias:
                            key = alias['component']

                if value:
                    components[key] = value

            if skip_record:
                continue

            if components:
                country, candidate_languages = self.country_rtree.country_and_languages(
                    latitude, longitude)
                if not (country and candidate_languages) or (
                        country != country_dir and not override_country_dir):
                    country = country_dir
                    candidate_languages = get_country_languages(country)
                    if not candidate_languages:
                        continue
                    candidate_languages = candidate_languages.items()

                components = self.fix_component_encodings(components)

                if language is None:
                    language = AddressComponents.address_language(
                        components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)

                    if language == UNKNOWN_LANGUAGE:
                        strip_unit_language = candidate_languages[0][
                            0] if candidate_languages else None
                    else:
                        strip_unit_language = language

                    street = self.components.strip_unit_phrases_for_language(
                        street, strip_unit_language)

                    street = abbreviate(street_types_gazetteer,
                                        street,
                                        language,
                                        abbreviate_prob=abbreviate_street_prob,
                                        separate_prob=separate_street_prob)
                    components[AddressFormatter.ROAD] = street

                house_number = components.get(AddressFormatter.HOUSE_NUMBER,
                                              None)
                if house_number:
                    house_number = self.cleanup_number(
                        house_number, strip_commas=house_number_strip_commas)

                    if language == CHINESE:
                        house_number = self.format_chinese_house_number(
                            house_number)

                    if country_dir == Countries.COLOMBIA:
                        house_number = self.format_colombian_house_number(
                            house_number)

                    if house_number is not None:
                        components[
                            AddressFormatter.HOUSE_NUMBER] = house_number

                unit = components.get(AddressFormatter.UNIT, None)

                street_required = country not in (
                    Countries.JAPAN, Countries.CZECH_REPUBLIC
                ) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES

                postcode = components.get(AddressFormatter.POSTCODE, None)

                if postcode:
                    components[AddressFormatter.
                               POSTCODE] = PostalCodes.add_country_code(
                                   postcode, country)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if (not street and street_required) or (
                        street and house_number and
                    (street.lower() == house_number.lower())) or (
                        unit and street and street.lower() == unit.lower()):
                    if not postcode:
                        continue
                    components = self.components.drop_address(components)

                # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
                unit = components.get(AddressFormatter.UNIT, None)

                if unit is not None:
                    if is_numeric_strict(unit):
                        unit = Unit.phrase(unit, language, country=country)
                    elif non_numeric_units:
                        unit = abbreviate(unit_types_gazetteer,
                                          unit,
                                          language,
                                          abbreviate_prob=abbreviate_unit_prob,
                                          separate_prob=separate_unit_prob)
                    else:
                        unit = None

                    if unit is not None:
                        components[AddressFormatter.UNIT] = unit
                    else:
                        components.pop(AddressFormatter.UNIT)
                        unit = None

                # CLDR country name
                country_name = self.cldr_country_name(country, language,
                                                      configs)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(
                            toponym_abbreviations_gazetteer,
                            component,
                            language,
                            abbreviate_prob=abbreviate_toponym_prob)
                        component = self.components.name_hyphens(component)
                        components[component_key] = component

                # Any components specified to be added by the config (usually state)
                if add_components:
                    for k, v in six.iteritems(add_components):
                        if k not in components:
                            components[k] = v

                # Get named states occasionally, added component is usually a state code
                address_state = self.components.state_name(
                    components, country, language)
                if address_state:
                    components[AddressFormatter.STATE] = address_state

                state = components.get(AddressFormatter.STATE)
                if state:
                    state = self.components.abbreviated_state(
                        state, country, language)
                    if state:
                        components[AddressFormatter.STATE] = state

                # This is expensive, so only turn on for files that don't supply their own city names
                # or for which those names are flawed
                osm_components = []

                # Using population=0 instead of None means if there's no known population or
                # we don't need to add OSM components, we assume the population of the town is
                # very small and the place name shouldn't be used unqualified (i.e. needs information
                # like state name to disambiguate it)
                population = 0
                unambiguous_city = False
                if add_osm_boundaries or AddressFormatter.CITY not in components:
                    osm_components = self.components.osm_reverse_geocoded_components(
                        latitude, longitude)
                    self.components.add_admin_boundaries(
                        components, osm_components, country, language,
                        latitude, longitude)
                    categorized = self.components.categorized_osm_components(
                        country, osm_components)
                    for component, label in categorized:
                        if label == AddressFormatter.CITY:
                            unambiguous_city = self.components.unambiguous_wikipedia(
                                component, language)
                            if 'population' in component:
                                population = component['population']
                            break

                if AddressFormatter.CITY not in components and city_replacements:
                    components.update({
                        k: v
                        for k, v in six.iteritems(city_replacements)
                        if k not in components
                    })

                # The neighborhood index is cheaper so can turn on for whole countries
                neighborhood_components = []
                if add_osm_neighborhoods:
                    neighborhood_components = self.components.neighborhood_components(
                        latitude, longitude)
                    self.components.add_neighborhoods(
                        components,
                        neighborhood_components,
                        country,
                        language,
                        replace_city=osm_neighborhood_overrides_city)

                self.components.cleanup_boundary_names(components)
                self.components.country_specific_cleanup(components, country)

                self.components.replace_name_affixes(components,
                                                     language,
                                                     country=country)

                self.components.replace_names(components)

                self.components.prune_duplicate_names(components)

                self.components.remove_numeric_boundary_names(components)
                self.components.add_house_number_phrase(components,
                                                        language,
                                                        country=country)
                self.components.add_postcode_phrase(components,
                                                    language,
                                                    country=country)

                # Component dropout
                all_osm_components = osm_components + neighborhood_components
                components = place_config.dropout_components(
                    components,
                    all_osm_components,
                    country=country,
                    population=population,
                    unambiguous_city=unambiguous_city)

                self.components.add_genitives(components, language)

                formatted = self.formatter.format_address(
                    components,
                    country,
                    language=language,
                    minimal_only=False,
                    tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < address_only_probability and street:
                    address_only_components = self.components.drop_places(
                        components)
                    address_only_components = self.components.drop_postcode(
                        address_only_components)
                    formatted = self.formatter.format_address(
                        address_only_components,
                        country,
                        language=language,
                        minimal_only=False,
                        tag_components=tag_components)
                    yield (language, country, formatted)

                rand_val = random.random()

                if street and house_number and rand_val < drop_address_probability:
                    components = self.components.drop_address(components)

                    if rand_val < place_and_postcode_probability:
                        components = self.components.drop_postcode(components)

                    if components and (len(components) > 1
                                       or add_osm_boundaries):
                        formatted = self.formatter.format_address(
                            components,
                            country,
                            language=language,
                            minimal_only=False,
                            tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self,
                            base_dir,
                            out_dir,
                            tag_components=True,
                            sources_only=None):
        """Write formatted OpenAddresses addresses to a TSV file in ``out_dir``.

        Walks every country in ``openaddresses_config.country_configs`` (in
        sorted order), and for each configured file -- first the country-level
        ``files``, then the ``files`` of each entry in ``subdirs`` (sorted) --
        streams the tuples produced by ``self.formatted_addresses`` into a
        single output file.

        :param base_dir: root directory containing one directory per country.
        :param out_dir: directory in which the output file is created.
        :param tag_components: if true, rows are
            ``(language, country, formatted_address)`` and are written to
            ``OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME``; otherwise rows hold
            only the formatted address and go to
            ``OPENADDRESSES_FORMAT_DATA_FILENAME``.
        :param sources_only: optional iterable of source paths that restricts
            which files are processed. Each entry may be given relative to
            ``base_dir`` or prefixed with it, and has at most three
            ``/``-separated parts (``country``, ``country/file``,
            ``country/subdir`` or ``country/subdir/file``). ``None`` means
            every configured source is processed.
        :raises AssertionError: if an entry of ``sources_only`` has more than
            three path parts.

        When ``self.debug`` is set, processing of the current file stops each
        time the running row count reaches a multiple of 1000; the remaining
        files are still visited.
        """
        all_sources_valid = sources_only is None
        valid_sources = set()
        if not all_sources_valid:
            for source in sources_only:
                if source.startswith(base_dir):
                    source = os.path.relpath(source, base_dir)

                parts = source.strip('/ ').split('/')
                if len(parts) > 3:
                    raise AssertionError(
                        'Sources may only have at maximum 3 parts')
                valid_sources.add(tuple(parts))

        def is_selected(*candidates):
            # A file is processed if no filter was given, or if the file
            # itself or any of its parent directories was listed.
            return all_sources_valid or any(
                candidate in valid_sources for candidate in candidates)

        def iter_sources(country_dir, country_config):
            # Lazily yields (progress message, path, configs) for each selected
            # file of one country, so the message is still printed right
            # before that file is processed. ``configs`` runs from most to
            # least specific.
            for file_config in country_config.get('files', []):
                filename = file_config['filename']
                if not is_selected((country_dir, filename), (country_dir, )):
                    continue

                message = six.u('doing {}/{}').format(country_dir, filename)
                path = os.path.join(base_dir, country_dir, filename)
                configs = (file_config, country_config,
                           openaddresses_config.config)
                yield message, path, configs

            for subdir in sorted(country_config.get('subdirs', {}).keys()):
                subdir_config = country_config['subdirs'][subdir]
                subdir = safe_decode(subdir)
                for file_config in subdir_config.get('files', []):
                    filename = file_config['filename']
                    if not is_selected((country_dir, subdir, filename),
                                       (country_dir, subdir),
                                       (country_dir, )):
                        continue

                    message = six.u('doing {}/{}/{}').format(
                        country_dir, subdir, filename)
                    path = os.path.join(base_dir, country_dir, subdir,
                                        filename)
                    configs = (file_config, subdir_config, country_config,
                               openaddresses_config.config)
                    yield message, path, configs

        if tag_components:
            out_filename = OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME
        else:
            out_filename = OPENADDRESSES_FORMAT_DATA_FILENAME

        i = 0

        # The context manager guarantees the output is flushed and closed even
        # if formatting one of the sources raises.
        with open(os.path.join(out_dir, out_filename), 'w') as out_file:
            writer = csv.writer(out_file, 'tsv_no_quote')

            for country_dir in sorted(
                    openaddresses_config.country_configs.keys()):
                country_config = openaddresses_config.country_configs[
                    country_dir]
                # Clear country cache for each new country
                self.country_rtree.clear_cache()

                for message, path, configs in iter_sources(country_dir,
                                                           country_config):
                    print(message)

                    for language, country, formatted_address in self.formatted_addresses(
                            country_dir,
                            path,
                            configs,
                            tag_components=tag_components):
                        if not formatted_address or not formatted_address.strip():
                            continue

                        # tsv_string may itself reduce the value to
                        # nothing, hence the second emptiness check.
                        formatted_address = tsv_string(formatted_address)
                        if not formatted_address or not formatted_address.strip():
                            continue

                        if tag_components:
                            row = (language, country, formatted_address)
                        else:
                            row = (formatted_address, )

                        writer.writerow(row)

                        i += 1
                        if i % 1000 == 0:
                            print('did {} formatted addresses'.format(i))
                            if self.debug:
                                # Debug runs only sample each file.
                                break
コード例 #6
0
def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, infile, out_dir, tag_components=True):
    '''
    Creates formatted address training data for supervised sequence labeling (or potentially 
    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.

    Example:

    cs  cz  Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country

    The field structure is similar to other training data created by this script i.e.
    {language, country, data}. The data field here is a sequence of labeled tokens similar
    to what we might see in part-of-speech tagging.


    This format uses a special character "|" to denote possible breaks in the input (comma, newline).

    Note that for the address parser, we'd like it to be robust to many different types
    of input, so we may selectively eleminate components

    This information can potentially be used downstream by the sequence model as these
    breaks may be present at prediction time.

    Example:

    sr      rs      Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic

    This may be useful in learning word representations, statistical phrases, morphology
    or other models requiring only the sequence of words.
    '''
    i = 0

    formatter = AddressFormatter()
    osm_address_components.configure()

    if tag_components:
        formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w')
        writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
    else:
        formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w')
        writer = csv.writer(formatted_file, 'tsv_no_quote')

    remove_keys = OSM_IGNORE_KEYS

    alpha3_codes = {c.alpha2: c.alpha3 for c in pycountry.countries}

    for node_id, value, deps in parse_osm(infile):
        try:
            latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
        except Exception:
            continue

        country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
        if not (country and candidate_languages):
            continue

        for key in remove_keys:
            _ = value.pop(key, None)

        language = None

        more_than_one_official_language = len(candidate_languages) > 1

        if tag_components:
            if len(candidate_languages) == 1:
                language = candidate_languages[0]['lang']
            else:
                street = value.get('addr:street', None)

                namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value]

                if street is not None and not namespaced:
                    language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
                elif namespaced and random.random() < 0.6:
                    language = random.choice(namespaced)
                    lang_suffix = ':{}'.format(language)
                    for k in value:
                        if k.startswith('addr:') and k.endswith(lang_suffix):
                            value[k.rstrip(lang_suffix)] = value[k]
                else:
                    language = UNKNOWN_LANGUAGE

        address_components = {k: v for k, v in value.iteritems() if k in formatter.aliases}
        formatter.replace_aliases(address_components)

        address_country = address_components.get(AddressFormatter.COUNTRY)

        '''
        Country names
        -------------

        In OSM, addr:country is almost always an ISO-3166 alpha-2 country code.
        However, we'd like to expand these to include natural language forms
        of the country names we might be likely to encounter in a geocoder or
        handwritten address.

        These splits are somewhat arbitrary but could potentially be fit to data
        from OpenVenues or other sources on the usage of country name forms.

        If the address includes a country, the selection procedure proceeds as follows:

        1. With probability a, select the country name in the language of the address
           (determined above), or with the localized country name if the language is
           undtermined or ambiguous.

        2. With probability b(1-a), sample a language from the distribution of
           languages on the Internet and use the country's name in that language.

        3. This is implicit, but with probability (1-b)(1-a), keep the country code
        '''

        non_local_language = None

        if random.random() < 0.3:
            # 30% of the time: add Quattroshapes country
            address_country = country.upper()

        r = random.random()

        # 1. 60% of the time: use the country name in the current language or the country's local language
        if address_country and r < 0.6:
            localized = None
            if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
                localized = language_country_names.get(language, {}).get(address_country.upper())

            if not localized:
                localized = country_localized_display_name(address_country.lower())

            if localized:
                address_components[AddressFormatter.COUNTRY] = localized
        # 2. 10% of the time: country's name in a language samples from the distribution of languages on the Internet
        elif address_country and r < 0.7:
            non_local_language = sample_random_language()
            lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper())
            if lang_country:
                address_components[AddressFormatter.COUNTRY] = lang_country
        # 3. 10% of the time: use the country's alpha-3 ISO code
        elif address_country and r < 0.8:
            iso_code_alpha3 = alpha3_codes.get(address_country)
            if iso_code_alpha3:
                address_components[AddressFormatter.COUNTRY] = iso_code_alpha3
        # 4. Implicit: the rest of the time keep the alpha-2 country code

        '''
        Venue names
        -----------

        Some venues have multiple names listed in OSM, grab them all
        '''

        venue_names = []
        for key in ('name', 'alt_name', 'loc_name', 'int_name', 'old_name'):
            venue_name = value.get(key)
            if venue_name:
                venue_names.append(venue_name)

        '''
        States
        ------

        Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name
        whereas we'd like to include both forms, so wtih some probability, replace the abbreviated
        name with the unabbreviated one e.g. CA => California
        '''
        address_state = address_components.get(AddressFormatter.STATE)

        if address_state and not non_local_language:
            state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language)

            if state_full_name and random.random() < 0.3:
                address_components[AddressFormatter.STATE] = state_full_name
        elif address_state and non_local_language:
            _ = address_components.pop(AddressFormatter.STATE, None)

        '''
        OSM boundaries
        --------------

        For many addresses, the city, district, region, etc. are all implicitly
        generated by the reverse geocoder e.g. we do not need an addr:city tag
        to identify that 40.74, -74.00 is in New York City as well as its parent
        geographies (New York county, New York state, etc.)

        Where possible we augment the addr:* tags with some of the reverse-geocoded
        relations from OSM.

        Since addresses found on the web may have the same properties, we
        include these qualifiers in the training data.
        '''

        osm_components = osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude)

        if non_local_language is not None:
            osm_suffix = ':{}'.format(non_local_language)
        elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
            osm_suffix = ':{}'.format(language)
        else:
            osm_suffix = ''

        name_key = ''.join(('name', osm_suffix))
        raw_name_key = 'name'
        simple_name_key = 'name:simple'
        international_name_key = 'int_name'

        iso_code_key = 'ISO3166-1:alpha2'
        iso_code3_key = 'ISO3166-1:alpha3'

        if osm_components:
            poly_components = defaultdict(list)

            existing_city_name = address_components.get(AddressFormatter.CITY)

            for component, components_values in osm_components.iteritems():
                seen = set()

                key, raw_key = osm_pick_random_name_key(suffix=osm_suffix)

                for component_value in components_values:
                    r = random.random()
                    name = None

                    if iso_code3_key in component_value and r < 0.1:
                        name = component_value[iso_code3_key]
                    elif iso_code_key in component_value and r < 0.3:
                        name = component_value[iso_code_key]
                    elif language == 'en' and not non_local_language and r < 0.7:
                        # Particularly to address the US (prefer United States,
                        # not United States of America) but may capture variations
                        # in other English-speaking countries as well.
                        if simple_name_key in component_value:
                            name = component_value[simple_name_key]
                        elif international_name_key in component_value:
                            name = component_value[international_name_key]
                    if not name:
                        name = component_value.get(key, component_value.get(raw_key))

                    if not name or (component != AddressFormatter.CITY and name == existing_city_name):
                        name = component_value.get(name_key, component_value.get(raw_name_key))

                    if not name or (component != AddressFormatter.CITY and name == existing_city_name):
                        continue

                    if (component, name) not in seen:
                        poly_components[component].append(name)
                        seen.add((component, name))

            for component, vals in poly_components.iteritems():
                if component not in address_components or (non_local_language and random.random() < 0.4):
                    if component == AddressFormatter.STATE_DISTRICT and random.random() < 0.5:
                        num = random.randrange(1, len(vals) + 1)
                        val = u', '.join(vals[:num])
                    else:
                        val = random.choice(vals)

                    if component == AddressFormatter.STATE and random.random() < 0.7:
                        val = STATE_EXPANSIONS.get(address_country, {}).get(val, val)
                    address_components[component] = val

        '''
        Quattroshapes/GeoNames cities
        -----------------------------

        Quattroshapes isn't great for everything, but it has decent city boundaries
        in places where OSM sometimes does not (or at least in places where we aren't
        currently able to create valid polygons). While Quattroshapes itself doesn't
        reliably use local names, which we'll want for consistency
        '''

        if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < 0.2):
            lang = non_local_language or language
            quattroshapes_cities = quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True)
            for result in quattroshapes_cities:
                if result.get(quattroshapes_rtree.LEVEL) == quattroshapes_rtree.LOCALITY and quattroshapes_rtree.GEONAMES_ID in result:
                    geonames_id = int(result[quattroshapes_rtree.GEONAMES_ID].split(',')[0])
                    names = geonames.get_alternate_names(geonames_id)

                    if not names or lang not in names:
                        continue

                    city = None
                    if 'abbr' not in names or non_local_language:
                        # Use the common city name in the target language
                        city = names[lang][0][0]
                    elif random.random() < 0.1:
                        # Use an abbreviation: NYC, BK, SF, etc.
                        city = random.choice(names['abbr'])[0]

                    if not city or not city.strip():
                        continue
                    address_components[AddressFormatter.CITY] = city
                    break
            else:
                if non_local_language and AddressFormatter.CITY in address_components and (
                        AddressFormatter.CITY_DISTRICT in osm_components or
                        AddressFormatter.SUBURB in osm_components):
                    address_components.pop(AddressFormatter.CITY)

        '''
        Neighborhoods
        -------------

        In some cities, neighborhoods may be included in a free-text address.

        OSM includes many neighborhoods but only as points, rather than the polygons
        needed to perform reverse-geocoding. We use a hybrid index containing
        Quattroshapes/Zetashapes polygons matched fuzzily with OSM names (which are
        on the whole of better quality).
        '''

        neighborhoods = neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
        neighborhood_levels = defaultdict(list)
        for neighborhood in neighborhoods:
            place_type = neighborhood.get('place')
            polygon_type = neighborhood.get('polygon_type')

            key, raw_key = osm_pick_random_name_key(suffix=osm_suffix)
            name = neighborhood.get(key, neighborhood.get(raw_key))

            if not name:
                name = neighborhood.get(name_key, neighborhood.get(raw_name_key))

                name_prefix = neighborhood.get('name:prefix')

                if name_prefix and random.random() < 0.5:
                    name = u' '.join([name_prefix, name])

            if not name:
                continue

            neighborhood_level = AddressFormatter.SUBURB

            if place_type == 'borough' or polygon_type == 'local_admin':
                neighborhood_level = AddressFormatter.CITY_DISTRICT

                # Optimization so we don't use e.g. Brooklyn multiple times
                city_name = address_components.get(AddressFormatter.CITY)
                if name == city_name:
                    name = neighborhood.get(name_key, neighborhood.get(raw_name_key))
                    if not name or name == city_name:
                        continue

            neighborhood_levels[neighborhood_level].append(name)

        for component, neighborhoods in neighborhood_levels.iteritems():
            if component not in address_components and random.random() < 0.5:
                address_components[component] = neighborhoods[0]

        '''
        Name normalization
        ------------------

        Probabilistically strip standard prefixes/suffixes e.g. "London Borough of"
        '''
        for component in BOUNDARY_COMPONENTS:
            name = address_components.get(component)
            if not name:
                continue
            replacement = replace_name_prefixes(replace_name_suffixes(name))
            if replacement != name and random.random() < 0.6:
                address_components[component] = replacement

        '''
        Name deduping
        -------------

        For some cases like "Antwerpen, Antwerpen, Antwerpen"
        that are very unlikely to occur in real life.
        '''

        name_components = defaultdict(list)

        for component in (AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB):
            name = address_components.get(component)
            if name:
                name_components[name].append(component)

        for name, components in name_components.iteritems():
            if len(components) > 1:
                for component in components[1:]:
                    address_components.pop(component, None)


        '''
        House number cleanup
        --------------------

        For some OSM nodes, particularly in Uruguay, we get house numbers
        that are actually a comma-separated list.

        If there's one comma in the house number, allow it as it might
        be legitimate, but if there are 2 or more, just take the first one.
        '''

        house_number = address_components.get(AddressFormatter.HOUSE_NUMBER)
        if house_number and house_number.count(',') >= 2:
            for num in house_number.split(','):
                num = num.strip()
                if num:
                    address_components[AddressFormatter.HOUSE_NUMBER] = num
                    break
            else:
                address_components.pop(AddressFormatter.HOUSE_NUMBER, None)

        # Version with all components
        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)

        if tag_components:
            formatted_addresses = []
            formatted_addresses.append(formatted_address)

            seen = set([formatted_address])

            address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES}
            if not address_components:
                continue

            current_components = address_components.keys()
            random.shuffle(current_components)

            component_set = component_bitset(address_components.keys())

            for component in current_components:
                if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
                    address_components.pop(component)
                    component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                    if not address_components:
                        break

                    # Since venue names are 1-per-record, we must use them all
                    for venue_name in (venue_names or [None]):
                        if venue_name and AddressFormatter.HOUSE in address_components:
                            address_components[AddressFormatter.HOUSE] = venue_name
                        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
                        if formatted_address not in seen:
                            formatted_addresses.append(formatted_address)
                            seen.add(formatted_address)

            for formatted_address in formatted_addresses:
                if formatted_address and formatted_address.strip():
                    formatted_address = tsv_string(formatted_address)
                    if not formatted_address or not formatted_address.strip():
                        continue
                    row = (language, country, formatted_address)

                    writer.writerow(row)
        elif formatted_address and formatted_address.strip():
            formatted_address = tsv_string(formatted_address)
            writer.writerow([formatted_address])

        i += 1
        if i % 1000 == 0 and i > 0:
            print 'did', i, 'formatted addresses'
コード例 #7
0
 def __init__(self):
     """Create the AddressFormatter instance and keep it on ``self.formatter``."""
     self.formatter = AddressFormatter()
コード例 #8
0
ファイル: formatter.py プロジェクト: BERENZ/libpostal
class OpenAddressesFormatter(object):
    # Regex clean-ups applied to raw field values, keyed by address component.
    # The replacements stored under the ``None`` key are applied to every field.
    # Patterns are written as raw strings so regex escapes such as ``\s`` are
    # not parsed as (invalid) Python string escapes; the compiled patterns are
    # unchanged.
    field_regex_replacements = {
        # All fields
        None: [
            (re.compile(r'<\s*null\s*>', re.I), u''),
            (re.compile(r'[\s]{2,}'), six.u(' ')),
            (re.compile(r'\`'), u"'"),
            (re.compile(r'\-?\*'), u""),
        ],
        AddressFormatter.HOUSE_NUMBER: [
            # Most of the house numbers in Montreal start with "#"
            (re.compile(r'^#', re.UNICODE), u''),
            # Some house numbers have multiple hyphens
            (re.compile(r'[\-]{2,}'), u'-'),
            # Some house number ranges are split up like "12 -14"
            (re.compile(r'[\s]*\-[\s]*'), u'-'),
        ]
    }

    # Maps language -> compiled regex, built at class-definition time from the
    # 'unit_types_numbered' phrase dictionaries.
    unit_type_regexes = {}

    for (lang, dictionary_type), values in six.iteritems(address_phrase_dictionaries.phrases):
        if dictionary_type == 'unit_types_numbered':
            # Phrases of two characters or fewer are left out of the alternation
            unit_phrases = [safe_encode(p) for p in itertools.chain(*values) if len(p) > 2]
            # Matches, case-insensitively and anchored at the end of the string:
            # one of the phrases, whitespace, an optional "#", then digits, a
            # single letter, or a letter/digit combination with an optional hyphen.
            # NOTE(review): phrases are interpolated without re.escape -- confirm
            # the dictionaries never contain regex metacharacters.
            pattern = re.compile(r'\b(?:{})\s+(?:#?\s*)(?:[\d]+|[a-z]|[a-z]\-?[\d]+|[\d]+\-?[a-z])\s*$'.format(safe_encode('|').join(unit_phrases)),
                                 re.I | re.UNICODE)
            unit_type_regexes[lang] = pattern

    def __init__(self, components, country_rtree, debug=False):
        '''
        Keep references to the collaborators used while formatting rows.

        components: object whose helper methods are called through
            ``self.components``
        country_rtree: index object stored as ``self.country_rtree``
        debug: flag stored unchanged as ``self.debug``
        '''
        self.formatter = AddressFormatter()
        self.debug = debug
        self.country_rtree = country_rtree
        self.components = components

    class validators:
        '''
        Namespace of classmethod predicates deciding whether a cleaned field
        value is acceptable for an address component.
        '''

        @classmethod
        def validate_postcode(cls, postcode):
            '''
            Reject postcodes consisting only of zeros and the separators
            "-", ".", " " and "," -- these are improperly-formatted NULLs.
            An empty postcode is rejected as well.
            '''
            null_chars = ('0', '-', '.', ' ', ',')
            for char in postcode:
                if char not in null_chars:
                    return True
            return False

        @classmethod
        def validate_street(cls, street):
            '''
            A street that is just a number is most likely a copy/paste error
            (it belongs in the house number field), so it is rejected.
            '''
            street_is_number = is_numeric(street)
            return not street_is_number

        @classmethod
        def validate_house_number(cls, house_number):
            '''
            Guard against insane house number values.

            Some OpenAddresses data sets put the capitalized street name in
            the house number field, and many sources (county GIS servers with
            assorted schemas) encode NULLs as zeros. Plain integers are
            accepted only when greater than zero; anything else must look like
            a number, a fraction, a number plus letter, or a number/number
            form, and must not have digits that are all zeros. This may miss a
            few houses numbered "A", etc.

            Returns a truthy value for acceptable house numbers and a falsy
            one otherwise (not necessarily a bool).
            '''
            try:
                return int(house_number.strip()) > 0
            except (ValueError, TypeError):
                stripped = house_number.strip()
                if not stripped:
                    return stripped

                plausible = (is_numeric(stripped) or
                             fraction_regex.match(stripped) or
                             number_space_letter_regex.match(stripped) or
                             number_slash_number_regex.match(stripped) or
                             number_fraction_regex.match(stripped))
                if not plausible:
                    return plausible

                digits_all_zero = all((c == '0' for c in stripped if c.isdigit()))
                return not digits_all_zero

        @classmethod
        def validate_house_number_sin_numero(cls, house_number):
            '''
            Accept values matching ``sin_numero_regex``; otherwise apply the
            generic house number validation.
            '''
            matched = sin_numero_regex.match(house_number)
            return True if matched else cls.validate_house_number(house_number)

        @classmethod
        def validate_russian_house_number(cls, house_number):
            '''
            Accept values matching any of the Russian-specific house number
            patterns; otherwise apply the generic house number validation.
            '''
            special_forms = (dom_korpus_stroyeniye_regex, uchastok_regex, bea_nomera_regex)
            if any(form.match(house_number) for form in special_forms):
                return True
            return cls.validate_house_number(house_number)

        @classmethod
        def validate_colombian_house_number(cls, house_number):
            '''
            Every value is accepted for Colombia.
            '''
            return True

        @classmethod
        def validate_chinese_house_number(cls, house_number):
            '''
            Accept non-empty values whose tokens are all numeric or one of the
            markers 号, 栋, 附; otherwise apply the generic validation.
            '''
            if not house_number:
                return False

            markers = (u'号', u'栋', u'附')
            for token, token_class in tokenize(house_number):
                if token_class not in token_types.NUMERIC_TOKEN_TYPES and token not in markers:
                    return cls.validate_house_number(house_number)
            return True

    # Default validator per address component; used when neither a
    # country-specific nor a language-specific validator exists for the field.
    component_validators = {
        AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
        AddressFormatter.ROAD: validators.validate_street,
        AddressFormatter.POSTCODE: validators.validate_postcode,
    }

    # Validators keyed by language, then by component. These take precedence
    # over component_validators.
    language_validators = {
        SPANISH: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
        },
        PORTUGUESE: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
        },
        RUSSIAN: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number,
        },
        CHINESE: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number,
        }
    }

    # Validators keyed by country, then by component. These take precedence
    # over both language_validators and component_validators.
    country_validators = {
        Countries.COLOMBIA: {
            AddressFormatter.HOUSE_NUMBER: validators.validate_colombian_house_number
        }
    }

    # A run of digits that is not followed by another digit or by 号 / 栋
    chinese_annex_regex = re.compile(u'([\d]+)(?![\d号栋])', re.U)

    @classmethod
    def format_chinese_house_number(cls, house_number):
        '''
        Append 号 to every digit run matched by ``chinese_annex_regex``
        (digits not already followed by a digit, 号 or 栋).

        Empty or None input is returned unchanged.
        '''
        if house_number:
            return cls.chinese_annex_regex.sub(u'\\1号', house_number)
        return house_number

    @classmethod
    def format_colombian_house_number(cls, house_number):
        '''
        Randomly vary the surface form of a Colombian house number.

        When the stripped value matches
        ``colombian_standard_house_number_regex``, its two captured parts
        (cross street and building number) are re-joined with a randomly
        chosen separator, inner spaces are removed from each part half of the
        time, and a prefix such as "#" or "no." is prepended half of the time
        unless the value already starts with one. Values that do not match
        are returned stripped but otherwise untouched.
        '''
        house_number = house_number.strip()
        parsed = colombian_standard_house_number_regex.match(house_number)
        if not parsed:
            return house_number

        joiner = random.choice((u'-', u' - ', u' '))

        cross_street, building_number = parsed.groups()

        parts = []
        for part in (cross_street, building_number):
            # The coin is only flipped for parts that actually contain a space
            if part and u' ' in part and random.choice((True, False)):
                part = part.replace(u' ', u'')
            if part:
                parts.append(part)

        if not parts:
            return house_number

        house_number = joiner.join(parts)
        prefixes = (u'#', u'no.', u'no', u'nº')
        # Flip the coin first, then check for an existing prefix
        if random.choice((True, False)):
            if not house_number.lower().startswith(prefixes):
                house_number = u' '.join([random.choice(prefixes), house_number])

        return house_number

    def get_property(self, key, *configs):
        '''
        Look up ``key`` in each config in turn and return the first value
        that is not None; return None when no config supplies one.
        '''
        found = (config.get(key, None) for config in configs)
        return next((value for value in found if value is not None), None)

    def cldr_country_name(self, country_code, language, configs):
        '''
        Randomly pick a country name representation for ``country_code``.

        With probability ``cldr_country_probability`` one of four forms is
        drawn according to the configured weights: localized name, ISO 3166
        name, alpha-2 code or alpha-3 code. Returns None when the initial
        draw decides not to produce a name.
        '''
        use_name_prob = float(self.get_property('cldr_country_probability', *configs))
        if not (random.random() < use_name_prob):
            return None

        kinds = localized, iso_3166, alpha2, alpha3 = range(4)

        weight_keys = (
            'localized_name_probability',
            'iso_3166_name_probability',
            'iso_alpha_2_code_probability',
            'iso_alpha_3_code_probability',
        )
        weights = [float(self.get_property(weight_key, *configs)) for weight_key in weight_keys]

        chosen = weighted_choice(kinds, cdf(weights))

        # The upper-cased code doubles as the alpha-2 form and as the fallback
        upper_code = country_code.upper()

        if chosen == localized:
            return (country_names.localized_name(country_code, language) or
                    country_names.localized_name(country_code) or
                    upper_code)
        if chosen == iso_3166:
            # NOTE(review): unlike the other branches there is no fallback to
            # the upper-cased code here -- confirm this is intended
            return country_names.iso3166_name(country_code)
        if chosen == alpha3:
            return country_names.alpha3_code(country_code) or upper_code
        return upper_code

    @classmethod
    def cleanup_number(cls, num, strip_commas=False):
        '''
        Normalize a numeric field that may have been exported as a float.

        The value is stripped (and commas removed when ``strip_commas`` is
        set). Integer strings are returned as-is. Float-like strings such as
        "12.0" are truncated to their integer part, keeping any zero padding
        ("00501.0" -> "00501"). Anything else, including "nan" and "inf",
        is returned unchanged apart from the stripping.

        :param num: raw field value (string)
        :param strip_commas: remove "," characters before parsing
        :returns: the cleaned string
        '''
        num = num.strip()
        if strip_commas:
            num = num.replace(six.u(','), six.u(''))

        try:
            int(num)
        except (ValueError, TypeError):
            pass
        else:
            # Already a plain integer string, nothing to clean
            return num

        try:
            whole = int(float(num))
        except (ValueError, TypeError, OverflowError):
            # Not numeric at all, or NaN (ValueError) / infinity (OverflowError)
            return num

        leading_zeros = len(num) - len(num.lstrip(six.u('0')))
        if whole == 0 and leading_zeros:
            # The integer part's own "0" is re-emitted below, so it must not
            # also be counted as padding ("0.0" -> "0", not "00")
            leading_zeros -= 1

        cleaned = safe_decode(whole)
        if leading_zeros:
            cleaned = six.u('{}{}').format(six.u('0') * leading_zeros, cleaned)
        return cleaned

    @classmethod
    def fix_component_encodings(cls, components):
        '''
        Return a new dict with every value passed through ``safe_decode`` and
        then ``ftfy.fix_encoding``; keys are left as they are.
        '''
        repaired = {}
        for name, raw_value in six.iteritems(components):
            repaired[name] = ftfy.fix_encoding(safe_decode(raw_value))
        return repaired

    def formatted_addresses(self, country_dir, path, configs, tag_components=True):
        '''
        Generate formatted training addresses from one OpenAddresses CSV file.

        Each row is mapped to address components through the ``fields``
        config, cleaned and validated per field, optionally augmented with
        OSM boundaries and neighborhoods, and rendered with the address
        formatter.

        :param country_dir: country key of the file; selects country-specific
            validators/formatting and is the fallback country for a row
        :param path: path of the CSV file; it must have ``LAT`` and ``LON``
            columns (``ValueError`` otherwise)
        :param configs: config dicts in lookup order; each property is taken
            from the first config that defines it
        :param tag_components: passed through to ``format_address``
        :returns: generator of ``(language, country, formatted)`` tuples. A
            row may yield up to three variants: the full address, an
            address-only version and a version with the address dropped.
            Nothing is yielded when the config defines no ``fields`` or the
            file is empty.
        '''
        abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
        separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
        abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
        separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
        abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs))

        add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
        add_osm_neighborhoods = bool(self.get_property('add_osm_neighborhoods', *configs) or False)
        osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs)
        non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False)
        house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False)
        numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
        postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)

        address_only_probability = float(self.get_property('address_only_probability', *configs))
        place_only_probability = float(self.get_property('place_only_probability', *configs))
        place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))

        city_replacements = self.get_property('city_replacements', *configs)

        override_country_dir = self.get_property('override_country_dir', *configs)

        postcode_length = int(self.get_property('postcode_length', *configs) or 0)

        drop_address_probability = place_only_probability + place_and_postcode_probability

        ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])

        # field -> single regex that is the alternation of all configured patterns
        ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE)
                                    for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))}

        # field -> list of (compiled pattern, alias config) pairs
        alias_fields_containing = {field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
                                   for field, value in six.iteritems(dict(self.get_property('alias_fields_containing', *configs) or {}))}

        config_language = self.get_property('language', *configs)

        add_components = self.get_property('add', *configs)

        fields = self.get_property('fields', *configs)
        if not fields:
            return

        field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)}
        mapped_values = {f['component']: f['value_map'] for f in six.itervalues(fields) if hasattr(f.get('value_map'), 'get')}

        # The file has to stay open while rows are consumed lazily; the context
        # manager closes it once the generator is exhausted or closed.
        with open(path) as f:
            reader = unicode_csv_reader(f)
            # next() works for both Python 2 and Python 3 iterators; an empty
            # file simply produces no addresses
            headers = next(reader, None)
            if headers is None:
                return

            header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
            latitude_index = headers.index('LAT')
            longitude_index = headers.index('LON')

            # Clear cached polygons
            self.components.osm_admin_rtree.clear_cache()
            self.components.neighborhoods_rtree.clear_cache()

            for row in reader:
                try:
                    latitude = float(row[latitude_index])
                    longitude = float(row[longitude_index])
                except (ValueError, TypeError):
                    continue

                language = config_language

                components = {}

                skip_record = False

                for i, key in six.iteritems(header_indices):
                    value = row[i].strip()
                    if not value and key in ignore_rows_missing_fields:
                        skip_record = True
                        break
                    elif not value:
                        continue

                    if key in mapped_values:
                        value = mapped_values[key].get(value, value)

                    if key == AddressFormatter.ROAD and language == SPANISH:
                        value = self.components.spanish_street_name(value)

                    if key == AddressFormatter.POSTCODE:
                        value = self.cleanup_number(value)

                        if postcode_strip_non_digit_chars:
                            value = six.u('').join((c for c in value if c.isdigit()))

                        if value and not is_numeric(value) and numeric_postcodes_only:
                            continue
                        else:
                            if postcode_length:
                                value = value.zfill(postcode_length)[:postcode_length]

                    if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
                        # Boundaries from the file are ignored when OSM supplies them
                        if add_osm_boundaries:
                            continue
                        value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
                        if value and ((len(value) < 2 and not get_string_script(value)[0].lower() in ideographic_scripts) or is_numeric(value)):
                            continue

                    if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                        continue

                    for exp, sub_val in self.field_regex_replacements.get(key, []):
                        value = exp.sub(sub_val, value)

                    for exp, sub_val in self.field_regex_replacements.get(None, []):
                        value = exp.sub(sub_val, value)

                    value = value.strip(', -')

                    # Most specific validator wins: country, then language, then component default
                    validator = self.country_validators.get(country_dir, {}).get(key, self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None)))

                    if validator is not None and not validator(value):
                        continue

                    if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
                        continue

                    # A matching alias may re-label the value as a different component
                    for (pattern, alias) in alias_fields_containing.get(key, []):
                        if pattern.search(value):
                            if 'component' in alias:
                                key = alias['component']

                    if value:
                        components[key] = value

                if skip_record:
                    continue

                if components:
                    country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
                    if not (country and candidate_languages) or (country != country_dir and not override_country_dir):
                        country = country_dir
                        candidate_languages = get_country_languages(country)
                        if not candidate_languages:
                            continue
                        # Materialize as a list: candidate_languages is indexed
                        # below, which a Python 3 dict view does not support
                        candidate_languages = list(candidate_languages.items())

                    components = self.fix_component_encodings(components)

                    if language is None:
                        language = AddressComponents.address_language(components, candidate_languages)

                    street = components.get(AddressFormatter.ROAD, None)
                    if street is not None:
                        street = street.strip()
                        street = AddressComponents.cleaned_name(street)

                        if language == UNKNOWN_LANGUAGE:
                            strip_unit_language = candidate_languages[0][0] if candidate_languages else None
                        else:
                            strip_unit_language = language

                        street = self.components.strip_unit_phrases_for_language(street, strip_unit_language)

                        street = abbreviate(street_types_gazetteer, street, language,
                                            abbreviate_prob=abbreviate_street_prob,
                                            separate_prob=separate_street_prob)
                        components[AddressFormatter.ROAD] = street

                    house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                    if house_number:
                        house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)

                        if language == CHINESE:
                            house_number = self.format_chinese_house_number(house_number)

                        if country_dir == Countries.COLOMBIA:
                            house_number = self.format_colombian_house_number(house_number)

                        if house_number is not None:
                            components[AddressFormatter.HOUSE_NUMBER] = house_number

                    unit = components.get(AddressFormatter.UNIT, None)

                    street_required = country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES

                    postcode = components.get(AddressFormatter.POSTCODE, None)

                    if postcode:
                        components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country)

                    # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                    if (not street and street_required) or (street and house_number and (street.lower() == house_number.lower())) or (unit and street and street.lower() == unit.lower()):
                        if not postcode:
                            continue
                        components = self.components.drop_address(components)

                    # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
                    unit = components.get(AddressFormatter.UNIT, None)

                    if unit is not None:
                        if is_numeric_strict(unit):
                            unit = Unit.phrase(unit, language, country=country)
                        elif non_numeric_units:
                            unit = abbreviate(unit_types_gazetteer, unit, language,
                                              abbreviate_prob=abbreviate_unit_prob,
                                              separate_prob=separate_unit_prob)
                        else:
                            unit = None

                        if unit is not None:
                            components[AddressFormatter.UNIT] = unit
                        else:
                            components.pop(AddressFormatter.UNIT)
                            unit = None

                    # CLDR country name
                    country_name = self.cldr_country_name(country, language, configs)
                    if country_name:
                        components[AddressFormatter.COUNTRY] = country_name

                    for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                        component = components.get(component_key, None)
                        if component is not None:
                            component = abbreviate(toponym_abbreviations_gazetteer, component, language,
                                                   abbreviate_prob=abbreviate_toponym_prob)
                            component = self.components.name_hyphens(component)
                            components[component_key] = component

                    # Any components specified to be added by the config (usually state)
                    if add_components:
                        for k, v in six.iteritems(add_components):
                            if k not in components:
                                components[k] = v

                    # Get named states occasionally, added component is usually a state code
                    address_state = self.components.state_name(components, country, language)
                    if address_state:
                        components[AddressFormatter.STATE] = address_state

                    state = components.get(AddressFormatter.STATE)
                    if state:
                        state = self.components.abbreviated_state(state, country, language)
                        if state:
                            components[AddressFormatter.STATE] = state

                    # This is expensive, so only turn on for files that don't supply their own city names
                    # or for which those names are flawed
                    osm_components = []

                    # Using population=0 instead of None means if there's no known population or
                    # we don't need to add OSM components, we assume the population of the town is
                    # very small and the place name shouldn't be used unqualified (i.e. needs information
                    # like state name to disambiguate it)
                    population = 0
                    unambiguous_city = False
                    if add_osm_boundaries or AddressFormatter.CITY not in components:
                        osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
                        self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude)
                        categorized = self.components.categorized_osm_components(country, osm_components)
                        for component, label in categorized:
                            if label == AddressFormatter.CITY:
                                unambiguous_city = self.components.unambiguous_wikipedia(component, language)
                                if 'population' in component:
                                    population = component['population']
                                break

                    if AddressFormatter.CITY not in components and city_replacements:
                        components.update({k: v for k, v in six.iteritems(city_replacements) if k not in components})

                    # The neighborhood index is cheaper so can turn on for whole countries
                    neighborhood_components = []
                    if add_osm_neighborhoods:
                        neighborhood_components = self.components.neighborhood_components(latitude, longitude)
                        self.components.add_neighborhoods(components, neighborhood_components, country, language, replace_city=osm_neighborhood_overrides_city)

                    self.components.cleanup_boundary_names(components)
                    self.components.country_specific_cleanup(components, country)

                    self.components.replace_name_affixes(components, language, country=country)

                    self.components.replace_names(components)

                    self.components.prune_duplicate_names(components)

                    self.components.remove_numeric_boundary_names(components)
                    self.components.add_house_number_phrase(components, language, country=country)
                    self.components.add_postcode_phrase(components, language, country=country)

                    # Component dropout
                    all_osm_components = osm_components + neighborhood_components
                    components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)

                    self.components.add_genitives(components, language)

                    formatted = self.formatter.format_address(components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                    # Occasionally also emit a variant without places and postcode
                    if random.random() < address_only_probability and street:
                        address_only_components = self.components.drop_places(components)
                        address_only_components = self.components.drop_postcode(address_only_components)
                        formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)

                    rand_val = random.random()

                    # Occasionally also emit a variant with the street address dropped
                    if street and house_number and rand_val < drop_address_probability:
                        components = self.components.drop_address(components)

                        if rand_val < place_and_postcode_probability:
                            components = self.components.drop_postcode(components)

                        if components and (len(components) > 1 or add_osm_boundaries):
                            formatted = self.formatter.format_address(components, country, language=language,
                                                                      minimal_only=False, tag_components=tag_components)
                            yield (language, country, formatted)

    def build_training_data(self, base_dir, out_dir, tag_components=True, sources_only=None):
        """Format every configured OpenAddresses source and write the rows to a TSV file.

        Sources come from ``openaddresses_config.country_configs``: each
        country config may list files directly (``files``) and inside named
        subdirectories (``subdirs``).

        :param base_dir: directory containing the per-country source files.
        :param out_dir: directory in which the output file is created.
        :param tag_components: when true, each row is ``(language, country,
            formatted_address)`` and the tagged output filename is used;
            otherwise each row holds only the formatted address.
        :param sources_only: optional iterable of source paths, either relative
            or starting with ``base_dir``. A direct file is processed if
            ``country/file`` or ``country`` is listed; a subdirectory file if
            ``country/subdir/file``, ``country/subdir`` or ``country`` is
            listed. ``None`` processes every configured source.
        :raises AssertionError: if a source has more than 3 path parts.
        """
        all_sources_valid = sources_only is None
        valid_sources = set()
        if not all_sources_valid:
            for source in sources_only:
                if source.startswith(base_dir):
                    source = os.path.relpath(source, base_dir)

                parts = source.strip('/ ').split('/')
                if len(parts) > 3:
                    raise AssertionError('Sources may only have at maximum 3 parts')
                valid_sources.add(tuple(parts))

        def iter_sources(country_dir, country_config):
            # Lazily yields (progress message, input path, config chain) for each
            # selected source of one country: direct files first, then the files
            # of each subdirectory in sorted order.
            for file_config in country_config.get('files', []):
                filename = file_config['filename']

                if not all_sources_valid and not ((country_dir, filename) in valid_sources or (country_dir,) in valid_sources):
                    continue

                yield (six.u('doing {}/{}').format(country_dir, filename),
                       os.path.join(base_dir, country_dir, filename),
                       (file_config, country_config, openaddresses_config.config))

            for subdir in sorted(country_config.get('subdirs', {}).keys()):
                subdir_config = country_config['subdirs'][subdir]
                subdir = safe_decode(subdir)
                for file_config in subdir_config.get('files', []):
                    filename = file_config['filename']

                    if not all_sources_valid and not ((country_dir, subdir, filename) in valid_sources or (country_dir, subdir) in valid_sources or (country_dir,) in valid_sources):
                        continue

                    yield (six.u('doing {}/{}/{}').format(country_dir, subdir, filename),
                           os.path.join(base_dir, country_dir, subdir, filename),
                           (file_config, subdir_config, country_config, openaddresses_config.config))

        if tag_components:
            out_filename = OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME
        else:
            out_filename = OPENADDRESSES_FORMAT_DATA_FILENAME

        i = 0

        # The context manager guarantees the output is flushed and closed,
        # even when formatting one of the sources raises.
        with open(os.path.join(out_dir, out_filename), 'w') as formatted_tagged_file:
            writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')

            for country_dir in sorted(openaddresses_config.country_configs.keys()):
                country_config = openaddresses_config.country_configs[country_dir]
                # Clear country cache for each new country
                self.country_rtree.clear_cache()

                for message, path, configs in iter_sources(country_dir, country_config):
                    print(message)

                    for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components):
                        if not formatted_address or not formatted_address.strip():
                            continue

                        # Sanitizing for TSV may leave nothing behind, so re-check
                        formatted_address = tsv_string(formatted_address)
                        if not formatted_address or not formatted_address.strip():
                            continue

                        if tag_components:
                            row = (language, country, formatted_address)
                        else:
                            row = (formatted_address,)

                        writer.writerow(row)
                        i += 1
                        if i % 1000 == 0 and i > 0:
                            print('did {} formatted addresses'.format(i))
                            # In debug mode, stop reading the current source
                            # (later sources are still visited)
                            if self.debug:
                                break
コード例 #9
0
    def __init__(self, geoplanet_db):
        """Connect to the GeoPlanet SQLite database and build in-memory lookups.

        Populates:

        * ``places``: first column of each ``places`` row -> remaining columns.
        * ``admins_with_ambiguous_city``: ids of non-Town places that have a
          child Town with the same name.
        * ``coterminous_admins``: the subset of those places whose only child
          is that Town, mapped to the id of a matching Town.
        * ``aliases``: place id -> list of alias rows (minus the id column).

        :param geoplanet_db: path passed to ``sqlite3.connect``.
        """
        self.db = sqlite3.connect(geoplanet_db)

        # The places table is small enough to keep entirely in memory
        self.places = dict(
            (record[0], record[1:])
            for record in self.db.execute('select * from places')
        )
        self.aliases = defaultdict(list)

        self.coterminous_admins = {}
        self.admins_with_ambiguous_city = set()

        print('Doing admin ambiguities')
        ambiguity_query = '''select p.id,
                                             (select count(*) from places where parent_id = p.id) as num_places,
                                             (select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
                                             p2.id
                                      from places p
                                      join places p2
                                          on p2.parent_id = p.id
                                          and p.name = p2.name
                                          and p.place_type != "Town"
                                          and p2.place_type = "Town"
                                      group by p.id'''
        for admin_id, num_places, num_towns, town_id in self.db.execute(ambiguity_query):
            child_count = int(num_places)
            town_count = int(num_towns)

            self.admins_with_ambiguous_city.add(admin_id)
            # A single child which is the same-named town: treat as coterminous
            if child_count == 1 and town_count == 1:
                self.coterminous_admins[admin_id] = town_id

        print('num coterminous: {}'.format(len(self.coterminous_admins)))
        print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))

        print('Doing aliases')
        alias_query = '''select a.* from aliases a
                                      left join places p
                                          on a.id = p.id
                                          and p.place_type in ("State", "County")
                                          and a.language != p.language
                                      where name_type != "S" -- no colloquial aliases like "The Big Apple"
                                      and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
                                      and p.id is NULL -- exclude foreign-language states/county names
                                      order by id, language,
                                      case name_type
                                          when "P" then 1
                                          when "Q" then 2
                                          when "V" then 3
                                          when "A" then 4
                                          when "S" then 5
                                          else 6
                                      end'''
        for alias_row in self.db.execute(alias_query):
            aliased_id = alias_row[0]
            # Only keep aliases for places that were loaded above
            if self.places.get(aliased_id):
                self.aliases[aliased_id].append(alias_row[1:])

        print('Doing variant aliases')
        variant_aliases = 0
        variant_query = '''select a.*, p.name, p.country_code from aliases a
                                                   join places p using(id)
                                                   where a.name_type = "V"
                                                   and a.language = p.language'''
        for i, joined_row in enumerate(self.db.execute(variant_query)):
            # The last two columns come from the joined places row
            alias_row = joined_row[:-2]
            place_name, country_code = joined_row[-2:]
            country = country_code.lower()

            place_id, alias, name_type, geoplanet_language = alias_row

            language = self.language_codes[geoplanet_language]
            if language == 'unk':
                language = None
            else:
                # Compare the names with affixes replaced when a replacement exists
                alias = name_affixes.replace_affixes(
                    alias, language, country=country) or alias
                place_name = name_affixes.replace_affixes(
                    place_name, language, country=country) or place_name

            # A variant is kept only when it is equivalent to the place name
            if equivalent(place_name, alias, toponym_abbreviations_gazetteer,
                          language):
                self.aliases[place_id].append(alias_row[1:])
                variant_aliases += 1

            if i > 0 and i % 10000 == 0:
                print('tested {} variant aliases with {} positives'.format(
                    i, variant_aliases))

        # Freeze into a plain dict so later lookups cannot create empty entries
        self.aliases = dict(self.aliases)

        self.formatter = AddressFormatter()
コード例 #10
0
class OpenAddressesUKFormatter(object):
    """Builds formatted-address training data from an OpenAddresses UK CSV file.

    Each CSV row is mapped to address components, cleaned, randomly reduced
    (component dropout, address-only and place-only variants) and rendered
    with ``AddressFormatter``.
    """

    # CSV header name -> AddressFormatter component key
    field_map = {
        'pao': AddressFormatter.HOUSE_NUMBER,
        'street.name': AddressFormatter.ROAD,
        'town.name': AddressFormatter.CITY,
        'postcode.name': AddressFormatter.POSTCODE
    }

    def __init__(self):
        self.formatter = AddressFormatter()

    # Component key -> predicate; values failing the predicate are discarded
    component_validators = {
        AddressFormatter.HOUSE_NUMBER: OpenAddressesFormatter.validators.validate_house_number,
        AddressFormatter.ROAD: OpenAddressesFormatter.validators.validate_street,
        AddressFormatter.POSTCODE: OpenAddressesFormatter.validators.validate_postcode,
    }

    cldr_country_probability = 0.3
    # Chance of additionally emitting a variant without places and postcode
    address_only_probability = 0.4
    # Chance of additionally emitting a variant without street/house number
    drop_address_probability = 0.6
    # Within the above, chance that the postcode is dropped as well
    drop_address_and_postcode_probability = 0.1

    @classmethod
    def cleanup_number(cls, num, strip_commas=False):
        """Normalize a numeric string, truncating float-like values to integers.

        Strings that parse as an integer, or that are not numeric at all, are
        returned stripped but otherwise unchanged. Strings that parse as a
        float are truncated to an integer string, keeping the original count
        of leading zeros.

        :param num: the raw number string.
        :param strip_commas: remove commas before parsing when true.
        :return: the cleaned-up string.
        """
        num = num.strip()
        if strip_commas:
            num = num.replace(six.u(','), six.u(''))
        try:
            # Already an integer string: nothing to normalize
            int(num)
        except (ValueError, TypeError):
            try:
                num_float = float(num)
                leading_zeros = 0
                for c in num:
                    if c == six.u('0'):
                        leading_zeros += 1
                    else:
                        break
                num = safe_decode(int(num_float))
                if leading_zeros:
                    num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int() of an infinite float (e.g. "inf", "1e999");
                # like any other unparseable value it is returned unchanged
                pass
        return num

    def fix_component_encodings(self, components):
        """Return a copy of ``components`` with every value decoded and encoding-repaired."""
        return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)}

    def formatted_addresses(self, path, tag_components=True):
        """Yield ``(language, country, formatted_address)`` tuples for a CSV file.

        The first row of the file is treated as the header; only columns named
        in ``field_map`` are used. A row may yield up to three variants: the
        full address, an address-only variant and a variant without
        street/house number.

        :param path: path of the CSV file to read.
        :param tag_components: passed through to ``format_address``.
        """
        country = Countries.UNITED_KINGDOM
        candidate_languages = get_country_languages(country).items()

        # The file stays open only as long as the generator is being consumed
        with open(path) as f:
            reader = unicode_csv_reader(f)
            headers = next(reader, None)
            if headers is None:
                # Empty file: no header, nothing to format
                return

            # Column index -> component key, for the columns we know about
            header_indices = {i: self.field_map[k] for i, k in enumerate(headers) if k in self.field_map}

            for row in reader:
                components = {}

                for i, key in six.iteritems(header_indices):
                    value = row[i].strip()
                    if not value:
                        continue

                    if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                        continue

                    value = value.strip(', -')

                    validator = self.component_validators.get(key, None)

                    if validator is not None and not validator(value):
                        continue

                    if value:
                        components[key] = value

                if components:
                    components = self.fix_component_encodings(components)

                    language = AddressComponents.address_language(components, candidate_languages)

                    street = components.get(AddressFormatter.ROAD, None)
                    if street is not None:
                        street = street.strip()
                        street = AddressComponents.cleaned_name(street)
                        if AddressComponents.street_name_is_valid(street):

                            street = abbreviate(street_types_gazetteer, street, language)
                            components[AddressFormatter.ROAD] = street
                        else:
                            components.pop(AddressFormatter.ROAD)
                            street = None

                    house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                    if house_number:
                        house_number = self.cleanup_number(house_number, strip_commas=True)

                        if house_number is not None:
                            components[AddressFormatter.HOUSE_NUMBER] = house_number

                    postcode = components.get(AddressFormatter.POSTCODE, None)

                    # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                    if not street or (street and house_number and (street.lower() == house_number.lower())):
                        if not postcode:
                            continue
                        components = AddressComponents.drop_address(components)

                    country_name = AddressComponents.cldr_country_name(country, language)
                    if country_name:
                        components[AddressFormatter.COUNTRY] = country_name

                    for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                        component = components.get(component_key, None)
                        if component is not None:
                            component = abbreviate(toponym_abbreviations_gazetteer, component, language)
                            component = AddressComponents.name_hyphens(component)
                            components[component_key] = component

                    AddressComponents.replace_names(components)

                    AddressComponents.prune_duplicate_names(components)

                    AddressComponents.remove_numeric_boundary_names(components)
                    AddressComponents.add_house_number_phrase(components, language, country=country)

                    # Component dropout
                    components = place_config.dropout_components(components, country=country)

                    formatted = self.formatter.format_address(components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                    # Occasionally also emit street-level components only
                    if random.random() < self.address_only_probability and street:
                        address_only_components = AddressComponents.drop_places(components)
                        address_only_components = AddressComponents.drop_postcode(address_only_components)
                        formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)

                    # One draw decides both dropping the address and, for the
                    # lower part of the range, dropping the postcode too
                    rand_val = random.random()

                    if street and house_number and rand_val < self.drop_address_probability:
                        components = AddressComponents.drop_address(components)

                        if rand_val < self.drop_address_and_postcode_probability:
                            components = AddressComponents.drop_postcode(components)

                        if components and (len(components) > 1):
                            formatted = self.formatter.format_address(components, country, language=language,
                                                                      minimal_only=False, tag_components=tag_components)
                            yield (language, country, formatted)

    def build_training_data(self, infile, out_dir, tag_components=True):
        """Write the formatted addresses of ``infile`` to a TSV file in ``out_dir``.

        :param infile: path of the input CSV file.
        :param out_dir: directory in which the output file is created.
        :param tag_components: when true, rows are ``(language, country,
            formatted_address)`` and the tagged output filename is used;
            otherwise rows hold only the formatted address.
        """
        if tag_components:
            out_filename = OPENADDRESSES_UK_FORMAT_DATA_TAGGED_FILENAME
        else:
            out_filename = OPENADDRESSES_UK_FORMAT_DATA_FILENAME

        i = 0

        # Close (and flush) the output even if formatting raises part-way
        with open(os.path.join(out_dir, out_filename), 'w') as formatted_tagged_file:
            writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')

            for language, country, formatted_address in self.formatted_addresses(infile, tag_components=tag_components):
                if tag_components:
                    row = (language, country, formatted_address)
                else:
                    row = (formatted_address,)

                writer.writerow(row)
                i += 1
                if i % 1000 == 0 and i > 0:
                    print('did {} formatted addresses'.format(i))
コード例 #11
0
class GeoPlanetFormatter(object):
    """Builds formatted postal-code training data from a GeoPlanet SQLite database.

    The constructor loads places and aliases into memory;
    ``format_postal_codes`` then renders every postal code together with the
    names of its containing places, and ``build_training_data`` writes the
    result to a TSV file.
    """

    # Map of GeoPlanet language codes to ISO-639 alpha2 language codes
    language_codes = {
        'ENG': 'en',
        'JPN': 'ja',
        'GER': 'de',
        'SPA': 'es',
        'FRE': 'fr',
        'UNK': 'unk',
        'ITA': 'it',
        'POR': 'pt',
        'POL': 'pl',
        'ARA': 'ar',
        'CZE': 'cs',
        'SWE': 'sv',
        'CHI': 'zh',
        'RUM': 'ro',
        'FIN': 'fi',
        'DUT': 'nl',
        'NOR': 'nb',
        'DAN': 'da',
        'HUN': 'hu',
        # ISO 639-1 for Korean is "ko" ("kr" is Kanuri)
        'KOR': 'ko',
    }

    non_latin_script_languages = {
        'JPN',  # Japanese
        'ARA',  # Arabic
        'CHI',  # Chinese
        'KOR',  # Korean
    }

    # GeoPlanet alias name types
    ALIAS_PREFERRED = 'P'
    ALIAS_PREFERRED_FOREIGN = 'Q'
    ALIAS_VARIANT = 'V'
    ALIAS_ABBREVIATED = 'A'
    ALIAS_COLLOQUIAL = 'S'

    # Map of GeoPlanet place types to address formatter types
    place_types = {
        'Continent': AddressFormatter.WORLD_REGION,
        'Country': AddressFormatter.COUNTRY,
        'CountryRegion': AddressFormatter.COUNTRY_REGION,
        'State': AddressFormatter.STATE,
        'County': AddressFormatter.STATE_DISTRICT,
        'Island': AddressFormatter.ISLAND,
        'Town': AddressFormatter.CITY,
        # Note: if we do general place queries from GeoPlanet, this
        # may have to be mapped more carefully
        'LocalAdmin': AddressFormatter.CITY_DISTRICT,
        'Suburb': AddressFormatter.SUBURB,
    }

    def __init__(self, geoplanet_db):
        """Connect to the GeoPlanet SQLite database and build in-memory lookups.

        Populates ``places`` (id -> remaining columns), ``aliases`` (place id ->
        list of alias rows without the id), ``admins_with_ambiguous_city`` (ids
        of non-Town places having a same-named child Town) and
        ``coterminous_admins`` (those of them whose only child is that Town,
        mapped to a matching Town id).

        :param geoplanet_db: path passed to ``sqlite3.connect``.
        """
        self.db = sqlite3.connect(geoplanet_db)

        # These aren't too large and it's easier to have them in memory
        self.places = {
            row[0]: row[1:]
            for row in self.db.execute('select * from places')
        }
        self.aliases = defaultdict(list)

        self.coterminous_admins = {}
        self.admins_with_ambiguous_city = set()

        print('Doing admin ambiguities')
        for row in self.db.execute('''select p.id,
                                             (select count(*) from places where parent_id = p.id) as num_places,
                                             (select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
                                             p2.id
                                      from places p
                                      join places p2
                                          on p2.parent_id = p.id
                                          and p.name = p2.name
                                          and p.place_type != "Town"
                                          and p2.place_type = "Town"
                                      group by p.id'''):
            place_id, num_places, num_towns, coterminous_town_id = row
            num_places = int(num_places)
            num_towns = int(num_towns)

            # A single child which is the same-named town: treat as coterminous
            if num_places == 1 and num_towns == 1:
                self.coterminous_admins[place_id] = coterminous_town_id
            self.admins_with_ambiguous_city.add(place_id)

        print('num coterminous: {}'.format(len(self.coterminous_admins)))
        print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))

        print('Doing aliases')
        for row in self.db.execute('''select a.* from aliases a
                                      left join places p
                                          on a.id = p.id
                                          and p.place_type in ("State", "County")
                                          and a.language != p.language
                                      where name_type != "S" -- no colloquial aliases like "The Big Apple"
                                      and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
                                      and p.id is NULL -- exclude foreign-language states/county names
                                      order by id, language,
                                      case name_type
                                          when "P" then 1
                                          when "Q" then 2
                                          when "V" then 3
                                          when "A" then 4
                                          when "S" then 5
                                          else 6
                                      end'''):
            # Only keep aliases for places that were loaded above
            place = self.places.get(row[0])
            if not place:
                continue

            self.aliases[row[0]].append(row[1:])

        print('Doing variant aliases')
        variant_aliases = 0
        for i, row in enumerate(
                self.db.execute(
                    '''select a.*, p.name, p.country_code from aliases a
                                                   join places p using(id)
                                                   where a.name_type = "V"
                                                   and a.language = p.language'''
                )):
            # The last two columns come from the joined places row
            place_name, country_code = row[-2:]
            country = country_code.lower()

            row = row[:-2]
            place_id, alias, name_type, language = row

            language = self.language_codes[language]
            if language != 'unk':
                # Compare the names with affixes replaced when a replacement exists
                alias_sans_affixes = name_affixes.replace_affixes(
                    alias, language, country=country)
                if alias_sans_affixes:
                    alias = alias_sans_affixes

                place_name_sans_affixes = name_affixes.replace_affixes(
                    place_name, language, country=country)
                if place_name_sans_affixes:
                    place_name = place_name_sans_affixes
            else:
                language = None

            # A variant is kept only when it is equivalent to the place name
            if equivalent(place_name, alias, toponym_abbreviations_gazetteer,
                          language):
                self.aliases[row[0]].append(row[1:])
                variant_aliases += 1

            if i % 10000 == 0 and i > 0:
                print('tested {} variant aliases with {} positives'.format(
                    i, variant_aliases))

        # Freeze into a plain dict so later lookups cannot create empty entries
        self.aliases = dict(self.aliases)

        self.formatter = AddressFormatter()

    def get_place_hierarchy(self, place_id):
        """Return the chain of places from ``place_id`` up through its ancestors.

        Each element is ``(place_id,) + self.places[place_id]``; the last
        column of a place is followed as its parent id. The walk stops when the
        parent id is 1 or loops back to the starting id.

        :raises KeyError: if an id on the chain is not in ``self.places``.
        """
        all_places = []
        original_place_id = place_id
        place = self.places[place_id]
        all_places.append((place_id, ) + place)
        place_id = place[-1]
        while place_id != 1 and place_id != original_place_id:
            place = self.places[place_id]
            all_places.append((place_id, ) + place)
            place_id = place[-1]
        return all_places

    def get_aliases(self, place_id):
        """Return the list of alias rows for ``place_id`` (empty if none)."""
        return self.aliases.get(place_id, [])

    def cleanup_name(self, name):
        """Strip leading/trailing spaces, commas and hyphens from ``name``."""
        return name.strip(' ,-')

    def format_postal_codes(self, tag_components=True):
        """Yield ``(language, country, formatted_address)`` for every postal code.

        For each row of ``postal_codes`` the containing places are collected
        per language (place names under the ``None`` key, aliases under their
        own language), and every combination of names is formatted together
        with the postal code. A second variant is yielded when component
        dropout actually changed the set of components.

        :param tag_components: passed through to ``format_address``.
        """
        all_postal_codes = self.db.execute('select * from postal_codes')
        for postal_code_id, country, postal_code, language, place_type, parent_id in all_postal_codes:
            country = country.lower()
            postcode_language = language

            language = self.language_codes[language]

            # Very short codes are only kept if the country's postcode regex
            # matches them in full
            if len(postal_code) <= 3:
                postcode_regex = postcode_regexes.get(country)

                valid_postcode = False
                if postcode_regex:
                    match = postcode_regex.match(postal_code)
                    if match and match.end() == len(postal_code):
                        valid_postcode = True

                if not valid_postcode:
                    continue

            # If the county/state is coterminous with a city and contains only one place,
            # set the parent_id to the city instead
            if parent_id in self.coterminous_admins:
                parent_id = self.coterminous_admins[parent_id]

            place_hierarchy = self.get_place_hierarchy(parent_id)

            containing_places = defaultdict(set)

            # language -> {component type -> set of names}; None holds the
            # places' own names
            language_places = {None: containing_places}

            original_language = language

            have_default_language = False

            place_types_seen = set()

            for place_id, country, name, lang, place_type, parent in place_hierarchy:
                country = country.lower()

                # First language
                # NOTE(review): `language` is rebound by the loop over
                # language_places below before it is read again, so this
                # assignment appears to have no effect - confirm intent.
                if not have_default_language and lang != postcode_language:
                    language = self.language_codes[lang]
                    have_default_language = True

                place_type = self.place_types[place_type]
                # Skip an admin sharing its name with a child town until a
                # city has been seen in the hierarchy
                if AddressFormatter.CITY not in place_types_seen and place_id in self.admins_with_ambiguous_city:
                    continue

                name = self.cleanup_name(name)
                containing_places[place_type].add(name)

                aliases = self.get_aliases(place_id)
                for name, name_type, alias_lang in aliases:
                    if not alias_lang:
                        alias_lang = 'UNK'
                    # Aliases in the place's own (known) language are grouped
                    # with the place names themselves
                    if alias_lang == lang and lang != 'UNK':
                        alias_language = None
                    else:
                        alias_language = self.language_codes[alias_lang]

                    language_places.setdefault(alias_language,
                                               defaultdict(set))
                    lang_places = language_places[alias_language]

                    name = self.cleanup_name(name)

                    lang_places[place_type].add(name)

                place_types_seen.add(place_type)

            default_city_names = {
                name.lower() for name in language_places.get(None, {}).get(
                    AddressFormatter.CITY, [])
            }

            for language, containing_places in six.iteritems(language_places):
                if language is None:
                    language = original_language

                country_localized_name = country_names.localized_name(
                    country, language)
                if country_localized_name:
                    containing_places[AddressFormatter.COUNTRY].add(
                        country_localized_name)
                country_alpha3_code = country_names.alpha3_code(country)
                # NOTE(review): at this point `language` is a value from
                # language_codes (e.g. 'en'), never None or 'ENG', so this
                # branch looks unreachable - confirm the intended languages.
                if country_alpha3_code and language in (None, 'ENG'):
                    containing_places[AddressFormatter.COUNTRY].add(
                        country_alpha3_code)

                keys = containing_places.keys()
                all_values = containing_places.values()

                keys_set = set(keys)

                # One address per combination of names across component types
                for values in itertools.product(*all_values):
                    components = {AddressFormatter.POSTCODE: postal_code}

                    if not default_city_names:
                        components.update(zip(keys, values))
                    else:
                        for k, v in zip(keys, values):
                            if k == AddressFormatter.CITY or AddressFormatter.CITY in keys_set or v.lower(
                            ) not in default_city_names:
                                components[k] = v

                    format_language = language if self.formatter.template_language_matters(
                        country, language) else None
                    formatted = self.formatter.format_address(
                        components,
                        country,
                        language=format_language,
                        minimal_only=False,
                        tag_components=tag_components)

                    yield (language, country, formatted)

                    component_keys = set(components)
                    components = place_config.dropout_components(
                        components, (), country=country, population=0)

                    # Emit the dropout variant only if it differs and still
                    # has more than one component
                    if len(components) > 1 and set(
                            components) ^ component_keys:
                        formatted = self.formatter.format_address(
                            components,
                            country,
                            language=format_language,
                            minimal_only=False,
                            tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self, out_dir, tag_components=True):
        """Write all formatted postal codes to a TSV file in ``out_dir``.

        :param out_dir: directory in which the output file is created.
        :param tag_components: when true, rows are ``(language, country,
            formatted_address)`` and the tagged output filename is used;
            otherwise rows hold only the formatted address.
        """
        if tag_components:
            out_filename = GEOPLANET_FORMAT_DATA_TAGGED_FILENAME
        else:
            out_filename = GEOPLANET_FORMAT_DATA_FILENAME

        i = 0

        # Close (and flush) the output even if formatting raises part-way
        with open(os.path.join(out_dir, out_filename), 'w') as formatted_tagged_file:
            writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')

            for language, country, formatted_address in self.format_postal_codes(
                    tag_components=tag_components):
                if not formatted_address or not formatted_address.strip():
                    continue

                # Sanitizing for TSV may leave nothing behind, so re-check
                formatted_address = tsv_string(formatted_address)
                if not formatted_address or not formatted_address.strip():
                    continue

                if tag_components:
                    row = (language, country, formatted_address)
                else:
                    row = (formatted_address, )

                writer.writerow(row)
                i += 1
                if i % 1000 == 0 and i > 0:
                    print('did {} formatted addresses'.format(i))
コード例 #12
0
    def __init__(self, geoplanet_db):
        """Load places, admin/town ambiguities and aliases from GeoPlanet.

        After construction the instance holds ``places`` (id -> remaining
        columns of the row), ``aliases`` (place id -> alias rows without the
        id), ``admins_with_ambiguous_city`` (non-Town places with a same-named
        child Town) and ``coterminous_admins`` (such places whose only child is
        that Town, mapped to a matching Town id).

        :param geoplanet_db: path passed to ``sqlite3.connect``.
        """
        self.db = sqlite3.connect(geoplanet_db)

        # Small table: simpler to hold it all in memory, keyed by first column
        self.places = dict((record[0], record[1:]) for record in self.db.execute('select * from places'))
        self.aliases = defaultdict(list)

        self.coterminous_admins = {}
        self.admins_with_ambiguous_city = set()

        print('Doing admin ambiguities')
        ambiguity_rows = self.db.execute('''select p.id,
                                             (select count(*) from places where parent_id = p.id) as num_places,
                                             (select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
                                             p2.id
                                      from places p
                                      join places p2
                                          on p2.parent_id = p.id
                                          and p.name = p2.name
                                          and p.place_type != "Town"
                                          and p2.place_type = "Town"
                                      group by p.id''')
        for admin_id, num_places, num_towns, town_id in ambiguity_rows:
            child_count = int(num_places)
            town_count = int(num_towns)

            self.admins_with_ambiguous_city.add(admin_id)
            # The admin's only child is the same-named town
            if child_count == 1 and town_count == 1:
                self.coterminous_admins[admin_id] = town_id

        print('num coterminous: {}'.format(len(self.coterminous_admins)))
        print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))

        print('Doing aliases')
        alias_rows = self.db.execute('''select a.* from aliases a
                                      left join places p
                                          on a.id = p.id
                                          and p.place_type in ("State", "County")
                                          and a.language != p.language
                                      where name_type != "S" -- no colloquial aliases like "The Big Apple"
                                      and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
                                      and p.id is NULL -- exclude foreign-language states/county names
                                      order by id, language,
                                      case name_type
                                          when "P" then 1
                                          when "Q" then 2
                                          when "V" then 3
                                          when "A" then 4
                                          when "S" then 5
                                          else 6
                                      end''')
        for alias_row in alias_rows:
            aliased_id = alias_row[0]
            # Ignore aliases of places that were not loaded
            if self.places.get(aliased_id):
                self.aliases[aliased_id].append(alias_row[1:])

        print('Doing variant aliases')
        variant_aliases = 0
        variant_rows = self.db.execute('''select a.*, p.name, p.country_code from aliases a
                                                   join places p using(id)
                                                   where a.name_type = "V"
                                                   and a.language = p.language''')
        for i, joined_row in enumerate(variant_rows):
            # Trailing two columns belong to the joined places row
            alias_row = joined_row[:-2]
            place_name, country_code = joined_row[-2:]
            country = country_code.lower()

            place_id, alias, name_type, geoplanet_language = alias_row

            language = self.language_codes[geoplanet_language]
            if language == 'unk':
                language = None
            else:
                # Use the affix-replaced forms for comparison when available
                alias = name_affixes.replace_affixes(alias, language, country=country) or alias
                place_name = name_affixes.replace_affixes(place_name, language, country=country) or place_name

            # Keep the variant only if it is equivalent to the place name
            if equivalent(place_name, alias, toponym_abbreviations_gazetteer, language):
                self.aliases[place_id].append(alias_row[1:])
                variant_aliases += 1

            if i > 0 and i % 10000 == 0:
                print('tested {} variant aliases with {} positives'.format(i, variant_aliases))

        # Plain dict from here on: lookups must not create empty entries
        self.aliases = dict(self.aliases)

        self.formatter = AddressFormatter()
コード例 #13
0
class GeoPlanetFormatter(object):
    '''
    Builds formatted postal-code training examples from a GeoPlanet SQLite database.

    The constructor reads the ``places`` table and the usable rows of the
    ``aliases`` table into memory. ``format_postal_codes`` walks the
    ``postal_codes`` table and yields formatted addresses, and
    ``build_training_data`` writes those to a TSV file.
    '''

    # Map of GeoPlanet language codes to ISO-639 alpha2 language codes
    language_codes = {
        'ENG': 'en',
        'JPN': 'ja',
        'GER': 'de',
        'SPA': 'es',
        'FRE': 'fr',
        'UNK': 'unk',
        'ITA': 'it',
        'POR': 'pt',
        'POL': 'pl',
        'ARA': 'ar',
        'CZE': 'cs',
        'SWE': 'sv',
        'CHI': 'zh',
        'RUM': 'ro',
        'FIN': 'fi',
        'DUT': 'nl',
        'NOR': 'nb',
        'DAN': 'da',
        'HUN': 'hu',
        # ISO-639-1 for Korean is "ko" ("kr" is the country code / Kanuri)
        'KOR': 'ko',
    }

    # GeoPlanet language codes listed here as using a non-Latin script
    non_latin_script_languages = {
        'JPN',  # Japanese
        'ARA',  # Arabic
        'CHI',  # Chinese
        'KOR',  # Korean
    }

    # Values of the aliases.name_type column
    ALIAS_PREFERRED = 'P'
    ALIAS_PREFERRED_FOREIGN = 'Q'
    ALIAS_VARIANT = 'V'
    ALIAS_ABBREVIATED = 'A'
    ALIAS_COLLOQUIAL = 'S'

    # Map of GeoPlanet place types to address formatter types
    place_types = {
        'Continent': AddressFormatter.WORLD_REGION,
        'Country': AddressFormatter.COUNTRY,
        'CountryRegion': AddressFormatter.COUNTRY_REGION,
        'State': AddressFormatter.STATE,
        'County': AddressFormatter.STATE_DISTRICT,
        'Island': AddressFormatter.ISLAND,
        'Town': AddressFormatter.CITY,
        # Note: if we do general place queries from GeoPlanet, this
        # may have to be mapped more carefully
        'LocalAdmin': AddressFormatter.CITY_DISTRICT,
        'Suburb': AddressFormatter.SUBURB,
    }

    def __init__(self, geoplanet_db):
        '''
        Open the SQLite database at ``geoplanet_db`` and preload lookup tables.

        Populates:
          * ``self.places``: place id -> remaining columns of the places row
          * ``self.coterminous_admins``: admin id -> id of the same-named town
            that is its only child
          * ``self.admins_with_ambiguous_city``: ids of non-town places that
            have a same-named child town
          * ``self.aliases``: place id -> list of alias rows (without the id)
        '''
        self.db = sqlite3.connect(geoplanet_db)

        # These aren't too large and it's easier to have them in memory
        self.places = {row[0]: row[1:] for row in self.db.execute('select * from places')}
        self.aliases = defaultdict(list)

        self.coterminous_admins = {}
        self.admins_with_ambiguous_city = set()

        print('Doing admin ambiguities')
        for row in self.db.execute('''select p.id,
                                             (select count(*) from places where parent_id = p.id) as num_places,
                                             (select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
                                             p2.id
                                      from places p
                                      join places p2
                                          on p2.parent_id = p.id
                                          and p.name = p2.name
                                          and p.place_type != "Town"
                                          and p2.place_type = "Town"
                                      group by p.id'''):
            place_id, num_places, num_towns, coterminous_town_id = row
            num_places = int(num_places)
            num_towns = int(num_towns)

            # The admin contains nothing but the same-named town
            if num_places == 1 and num_towns == 1:
                self.coterminous_admins[place_id] = coterminous_town_id
            # Every admin returned by the query shares its name with a child town
            self.admins_with_ambiguous_city.add(place_id)

        print('num coterminous: {}'.format(len(self.coterminous_admins)))
        print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))

        print('Doing aliases')
        for row in self.db.execute('''select a.* from aliases a
                                      left join places p
                                          on a.id = p.id
                                          and p.place_type in ("State", "County")
                                          and a.language != p.language
                                      where name_type != "S" -- no colloquial aliases like "The Big Apple"
                                      and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
                                      and p.id is NULL -- exclude foreign-language states/county names
                                      order by id, language,
                                      case name_type
                                          when "P" then 1
                                          when "Q" then 2
                                          when "V" then 3
                                          when "A" then 4
                                          when "S" then 5
                                          else 6
                                      end'''):
            # Skip aliases whose place was not loaded from the places table
            if not self.places.get(row[0]):
                continue

            self.aliases[row[0]].append(row[1:])

        print('Doing variant aliases')
        variant_aliases = 0
        for i, row in enumerate(self.db.execute('''select a.*, p.name, p.country_code from aliases a
                                                   join places p using(id)
                                                   where a.name_type = "V"
                                                   and a.language = p.language''')):
            place_name, country_code = row[-2:]
            country = country_code.lower()

            # Drop the two joined columns so ``row`` is the plain alias row again
            row = row[:-2]
            place_id, alias, name_type, language = row

            language = self.language_codes[language]
            if language != 'unk':
                # Compare on the affix-replaced forms when replace_affixes returns one
                alias_sans_affixes = name_affixes.replace_affixes(alias, language, country=country)
                if alias_sans_affixes:
                    alias = alias_sans_affixes

                place_name_sans_affixes = name_affixes.replace_affixes(place_name, language, country=country)
                if place_name_sans_affixes:
                    place_name = place_name_sans_affixes
            else:
                language = None

            # Variant aliases are kept only when equivalent() accepts them against
            # the place name; the stored row is the original, unmodified alias row
            if equivalent(place_name, alias, toponym_abbreviations_gazetteer, language):
                self.aliases[place_id].append(row[1:])
                variant_aliases += 1

            if i % 10000 == 0 and i > 0:
                print('tested {} variant aliases with {} positives'.format(i, variant_aliases))

        # Freeze into a plain dict so later lookups cannot create empty entries
        self.aliases = dict(self.aliases)

        self.formatter = AddressFormatter()

    def get_place_hierarchy(self, place_id):
        '''
        Return the chain of places from ``place_id`` up through its parents.

        Each element is ``(place_id,) + self.places[place_id]``; the last
        column of a place row is used as the parent id. The walk stops when
        the parent id is 1 or when a parent id has already been visited, so
        a cycle anywhere in the parent links cannot loop forever.

        Raises KeyError if ``place_id`` or any visited parent id is not in
        ``self.places``.
        '''
        place = self.places[place_id]
        all_places = [(place_id,) + place]
        seen = {place_id}
        place_id = place[-1]
        # NOTE(review): id 1 is presumably the root of the GeoPlanet hierarchy - confirm
        while place_id != 1 and place_id not in seen:
            seen.add(place_id)
            place = self.places[place_id]
            all_places.append((place_id,) + place)
            place_id = place[-1]
        return all_places

    def get_aliases(self, place_id):
        '''Return the alias rows loaded for ``place_id``, or an empty list.'''
        return self.aliases.get(place_id, [])

    def cleanup_name(self, name):
        '''Strip leading/trailing spaces, commas and hyphens from ``name``.'''
        return name.strip(' ,-')

    def format_postal_codes(self, tag_components=True):
        '''
        Yield ``(language, country, formatted_address)`` for every postal code.

        For each row of the ``postal_codes`` table, the names (and aliases)
        of the places in the parent hierarchy are grouped by language and
        component type. Every combination of those names is formatted once
        in full, and once more after ``place_config.dropout_components`` if
        the dropout changed the set of components and left more than one.

        ``tag_components`` is passed through to ``format_address``.
        '''
        all_postal_codes = self.db.execute('select * from postal_codes')
        for postal_code_id, country, postal_code, language, place_type, parent_id in all_postal_codes:
            country = country.lower()
            postcode_language = language

            language = self.language_codes[language]

            # Very short codes are only kept if the country's postcode regex
            # matches the whole string
            if len(postal_code) <= 3:
                postcode_regex = postcode_regexes.get(country)

                valid_postcode = False
                if postcode_regex:
                    match = postcode_regex.match(postal_code)
                    if match and match.end() == len(postal_code):
                        valid_postcode = True

                if not valid_postcode:
                    continue

            # If the county/state is coterminous with a city and contains only one place,
            # set the parent_id to the city instead
            if parent_id in self.coterminous_admins:
                parent_id = self.coterminous_admins[parent_id]

            place_hierarchy = self.get_place_hierarchy(parent_id)

            containing_places = defaultdict(set)

            # Key None holds the names in each place's own language; other keys
            # are ISO language codes for aliases in a different language
            language_places = {None: containing_places}

            original_language = language

            have_default_language = False

            place_types_seen = set()

            # Note: ``country`` is rebound here, so after the loop it holds the
            # (lowercased) country of the last place in the hierarchy
            for place_id, country, name, lang, place_type, parent in place_hierarchy:
                country = country.lower()

                # First language
                # NOTE(review): ``language`` assigned here is rebound by the loop over
                # language_places below before it is read again - confirm intent
                if not have_default_language and lang != postcode_language:
                    language = self.language_codes[lang]
                    have_default_language = True

                place_type = self.place_types[place_type]
                # Skip admins that share a name with a child town until a city has been seen
                if AddressFormatter.CITY not in place_types_seen and place_id in self.admins_with_ambiguous_city:
                    continue

                name = self.cleanup_name(name)
                containing_places[place_type].add(name)

                aliases = self.get_aliases(place_id)
                for name, name_type, alias_lang in aliases:
                    if not alias_lang:
                        alias_lang = 'UNK'
                    # Aliases in the place's own (known) language join the default group
                    if alias_lang == lang and lang != 'UNK':
                        alias_language = None
                    else:
                        alias_language = self.language_codes[alias_lang]

                    language_places.setdefault(alias_language, defaultdict(set))
                    lang_places = language_places[alias_language]

                    name = self.cleanup_name(name)

                    lang_places[place_type].add(name)

                place_types_seen.add(place_type)

            default_city_names = {name.lower() for name in language_places.get(None, {}).get(AddressFormatter.CITY, [])}

            for language, containing_places in six.iteritems(language_places):
                if language is None:
                    language = original_language

                country_localized_name = country_names.localized_name(country, language)
                if country_localized_name:
                    containing_places[AddressFormatter.COUNTRY].add(country_localized_name)
                country_alpha3_code = country_names.alpha3_code(country)
                # NOTE(review): ``language`` is an ISO code (or 'unk') at this point, never
                # None or 'ENG', so this branch looks unreachable - confirm whether
                # 'en' was intended before changing the generated data
                if country_alpha3_code and language in (None, 'ENG'):
                    containing_places[AddressFormatter.COUNTRY].add(country_alpha3_code)

                keys = list(containing_places.keys())
                all_values = list(containing_places.values())

                keys_set = set(keys)

                # One candidate address per combination of names across component types
                for values in itertools.product(*all_values):
                    components = {
                        AddressFormatter.POSTCODE: postal_code
                    }

                    if not default_city_names:
                        components.update(zip(keys, values))
                    else:
                        for k, v in zip(keys, values):
                            if k == AddressFormatter.CITY or AddressFormatter.CITY in keys_set or v.lower() not in default_city_names:
                                components[k] = v

                    format_language = language if self.formatter.template_language_matters(country, language) else None
                    formatted = self.formatter.format_address(components, country, language=format_language,
                                                              minimal_only=False, tag_components=tag_components)

                    yield (language, country, formatted)

                    component_keys = set(components)
                    components = place_config.dropout_components(components, (), country=country, population=0)

                    # Emit the dropout version only if it actually differs
                    if len(components) > 1 and set(components) ^ component_keys:
                        formatted = self.formatter.format_address(components, country, language=format_language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self, out_dir, tag_components=True):
        '''
        Write the output of ``format_postal_codes`` to a TSV file in ``out_dir``.

        With ``tag_components`` the file is GEOPLANET_FORMAT_DATA_TAGGED_FILENAME
        and each row is ``(language, country, formatted_address)``; otherwise
        the file is GEOPLANET_FORMAT_DATA_FILENAME and each row holds only the
        formatted address. Empty addresses are skipped. The file is closed
        when writing finishes or fails.
        '''
        if tag_components:
            filename = GEOPLANET_FORMAT_DATA_TAGGED_FILENAME
        else:
            filename = GEOPLANET_FORMAT_DATA_FILENAME

        with open(os.path.join(out_dir, filename), 'w') as formatted_tagged_file:
            writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')

            i = 0

            for language, country, formatted_address in self.format_postal_codes(tag_components=tag_components):
                if not formatted_address or not formatted_address.strip():
                    continue

                # tsv_string may itself leave nothing behind, so check again
                formatted_address = tsv_string(formatted_address)
                if not formatted_address or not formatted_address.strip():
                    continue

                if tag_components:
                    row = (language, country, formatted_address)
                else:
                    row = (formatted_address,)

                writer.writerow(row)
                i += 1
                if i % 1000 == 0 and i > 0:
                    print('did {} formatted addresses'.format(i))
コード例 #14
0
 def __init__(self):
     # Create the AddressFormatter instance this object keeps as self.formatter
     self.formatter = AddressFormatter()
コード例 #15
0
class OpenAddressesUKFormatter(object):
    '''
    Builds formatted address training examples from an OpenAddresses UK CSV file.

    ``formatted_addresses`` reads the CSV and yields formatted addresses;
    ``build_training_data`` writes them to a TSV file.
    '''

    # CSV column name -> address formatter component
    field_map = {
        'pao': AddressFormatter.HOUSE_NUMBER,
        'street.name': AddressFormatter.ROAD,
        'town.name': AddressFormatter.CITY,
        'postcode.name': AddressFormatter.POSTCODE
    }

    # Component -> predicate a value must satisfy to be kept
    component_validators = {
        AddressFormatter.HOUSE_NUMBER:
        OpenAddressesFormatter.validators.validate_house_number,
        AddressFormatter.ROAD:
        OpenAddressesFormatter.validators.validate_street,
        AddressFormatter.POSTCODE:
        OpenAddressesFormatter.validators.validate_postcode,
    }

    cldr_country_probability = 0.3
    # Chance of also emitting a street-address-only version of a record
    address_only_probability = 0.4
    # Chance of also emitting a version with the street address dropped
    drop_address_probability = 0.6
    # Chance (within the above) of dropping the postcode as well
    drop_address_and_postcode_probability = 0.1

    def __init__(self):
        '''Create the AddressFormatter used to render components.'''
        self.formatter = AddressFormatter()

    @classmethod
    def cleanup_number(cls, num, strip_commas=False):
        '''
        Normalize a house-number string.

        The value is stripped and, with ``strip_commas``, commas are removed.
        Values that parse as an int are returned as they are. Values that
        only parse as a float are truncated to an integer string, keeping
        any leading zeros. Anything else (including infinities, which
        cannot be converted to int) is returned unchanged.
        '''
        num = num.strip()
        if strip_commas:
            num = num.replace(six.u(','), six.u(''))
        try:
            int(num)
        except (ValueError, TypeError):
            try:
                num_float = float(num)
                leading_zeros = len(num) - len(num.lstrip(six.u('0')))
                num = safe_decode(int(num_float))
                if leading_zeros:
                    num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int(float('inf')) for inputs such as "inf" or "1e999"
                pass
        return num

    def fix_component_encodings(self, components):
        '''Return a copy of ``components`` with every value decoded and passed through ftfy.fix_encoding.'''
        return {
            k: ftfy.fix_encoding(safe_decode(v))
            for k, v in six.iteritems(components)
        }

    def _extract_components(self, row, header_indices):
        '''Map one CSV row to a {component: value} dict, dropping empty, placeholder and invalid values.'''
        components = {}

        for i, key in six.iteritems(header_indices):
            # Rows shorter than the header simply lack this field
            if i >= len(row):
                continue

            value = row[i].strip()
            if not value:
                continue

            if not_applicable_regex.match(value) or null_regex.match(
                    value) or unknown_regex.match(value):
                continue

            value = value.strip(', -')

            validator = self.component_validators.get(key, None)

            if validator is not None and not validator(value):
                continue

            if value:
                components[key] = value

        return components

    def formatted_addresses(self, path, tag_components=True):
        '''
        Yield ``(language, country, formatted_address)`` for the CSV at ``path``.

        The first row is read as the header; columns named in ``field_map``
        are used and the rest ignored. An empty file yields nothing. For
        each usable row the full address is yielded, followed at random by
        a street-address-only version and/or a version without the street
        address (and sometimes without the postcode). The input file is
        closed when the generator finishes or is closed.

        ``tag_components`` is passed through to ``format_address``.
        '''
        country = Countries.UNITED_KINGDOM
        candidate_languages = get_country_languages(country).items()

        with open(path) as f:
            reader = unicode_csv_reader(f)
            # next() with a default works on Python 2 and 3 and avoids a
            # StopIteration escaping the generator on an empty file
            headers = next(reader, None)
            if headers is None:
                return

            header_indices = {
                i: self.field_map[k]
                for i, k in enumerate(headers) if k in self.field_map
            }

            for row in reader:
                components = self._extract_components(row, header_indices)
                if not components:
                    continue

                components = self.fix_component_encodings(components)

                language = AddressComponents.address_language(
                    components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)
                    if AddressComponents.street_name_is_valid(street):
                        street = abbreviate(street_types_gazetteer, street,
                                            language)
                        components[AddressFormatter.ROAD] = street
                    else:
                        components.pop(AddressFormatter.ROAD)
                        street = None

                house_number = components.get(AddressFormatter.HOUSE_NUMBER,
                                              None)
                if house_number:
                    house_number = self.cleanup_number(house_number,
                                                       strip_commas=True)

                    if house_number is not None:
                        components[
                            AddressFormatter.HOUSE_NUMBER] = house_number

                postcode = components.get(AddressFormatter.POSTCODE, None)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if not street or (house_number and
                                  street.lower() == house_number.lower()):
                    if not postcode:
                        continue
                    components = AddressComponents.drop_address(components)

                country_name = AddressComponents.cldr_country_name(
                    country, language)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer,
                                               component, language)
                        component = AddressComponents.name_hyphens(component)
                        components[component_key] = component

                AddressComponents.replace_names(components)

                AddressComponents.prune_duplicate_names(components)

                AddressComponents.remove_numeric_boundary_names(components)
                AddressComponents.add_house_number_phrase(components,
                                                          language,
                                                          country=country)

                # Component dropout
                components = place_config.dropout_components(components,
                                                             country=country)

                formatted = self.formatter.format_address(
                    components,
                    country,
                    language=language,
                    minimal_only=False,
                    tag_components=tag_components)
                yield (language, country, formatted)

                # Sometimes also emit only the street-level part of the address
                if random.random() < self.address_only_probability and street:
                    address_only_components = AddressComponents.drop_places(
                        components)
                    address_only_components = AddressComponents.drop_postcode(
                        address_only_components)
                    formatted = self.formatter.format_address(
                        address_only_components,
                        country,
                        language=language,
                        minimal_only=False,
                        tag_components=tag_components)
                    yield (language, country, formatted)

                rand_val = random.random()

                # Sometimes also emit the address without its street-level part
                if street and house_number and rand_val < self.drop_address_probability:
                    components = AddressComponents.drop_address(components)

                    # The same draw decides whether the postcode goes as well
                    if rand_val < self.drop_address_and_postcode_probability:
                        components = AddressComponents.drop_postcode(
                            components)

                    if components and (len(components) > 1):
                        formatted = self.formatter.format_address(
                            components,
                            country,
                            language=language,
                            minimal_only=False,
                            tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self, infile, out_dir, tag_components=True):
        '''
        Write the output of ``formatted_addresses(infile)`` to a TSV file in ``out_dir``.

        With ``tag_components`` the file is
        OPENADDRESSES_UK_FORMAT_DATA_TAGGED_FILENAME and each row is
        ``(language, country, formatted_address)``; otherwise the file is
        OPENADDRESSES_UK_FORMAT_DATA_FILENAME and each row holds only the
        formatted address. The file is closed when writing finishes or fails.
        '''
        if tag_components:
            filename = OPENADDRESSES_UK_FORMAT_DATA_TAGGED_FILENAME
        else:
            filename = OPENADDRESSES_UK_FORMAT_DATA_FILENAME

        with open(os.path.join(out_dir, filename), 'w') as formatted_tagged_file:
            writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')

            i = 0

            for language, country, formatted_address in self.formatted_addresses(
                    infile, tag_components=tag_components):
                if tag_components:
                    row = (language, country, formatted_address)
                else:
                    row = (formatted_address, )

                writer.writerow(row)
                i += 1
                if i % 1000 == 0 and i > 0:
                    print('did {} formatted addresses'.format(i))