def test_countries(self):
        """Check language disambiguation for every country-level test case.

        Each entry of ``country_test_cases`` unpacks to the input string, a
        country, and the language that ``disambiguate_language`` is expected
        to return when given that country's languages.
        """
        for text, country_code, want in country_test_cases:
            country_langs = get_country_languages(country_code)
            # The country must have at least one known language.
            self.assertTrue(bool(country_langs))

            got = disambiguate_language(text, country_langs.items())
            failure_msg = '{} != {} for {}, langs={}'.format(
                got, want, text, country_langs.items())
            self.assertEqual(got, want, failure_msg)
    def test_countries(self):
            """Verify the disambiguated language for each country test case.

            ``country_test_cases`` yields ``(string, country, expected)``
            triples; the languages returned for the country are the
            candidates handed to ``disambiguate_language``.
            """
            message_template = '{} != {} for {}, langs={}'
            for case in country_test_cases:
                text, country_code, want = case
                country_langs = get_country_languages(country_code)
                # An empty language mapping means the fixture is unusable.
                self.assertTrue(bool(country_langs))

                got = disambiguate_language(text, country_langs.items())
                self.assertEqual(
                    got, want,
                    message_template.format(got, want, text,
                                            country_langs.items()))
    def test_regional(self):
            """Check language disambiguation for every regional test case.

            Each entry of ``regional_test_cases`` unpacks to the input string,
            a country, a key/value pair passed to ``get_regional_languages``,
            and the expected language. The regional languages are merged with
            the country-level ones (country values win on key clashes, since
            they are applied last via ``update``).
            """
            for text, country_code, region_key, region_value, want in regional_test_cases:
                country_langs = get_country_languages(country_code)
                self.assertTrue(bool(country_langs))

                candidates = get_regional_languages(country_code, region_key,
                                                    region_value)
                self.assertTrue(bool(candidates))
                candidates.update(country_langs)

                got = disambiguate_language(text, candidates.items())

                failure_msg = '{} != {} for {}, langs={}'.format(
                    got, want, text, candidates.items())
                self.assertEqual(got, want, failure_msg)
    def test_regional(self):
            """Verify the disambiguated language for each regional test case.

            ``regional_test_cases`` yields ``(string, country, k, v, expected)``
            tuples. Candidates are the languages from
            ``get_regional_languages(country, k, v)`` updated with the
            country-level languages.
            """
            message_template = '{} != {} for {}, langs={}'
            for case in regional_test_cases:
                text, country_code, region_key, region_value, want = case

                country_langs = get_country_languages(country_code)
                self.assertTrue(bool(country_langs))

                candidates = get_regional_languages(country_code, region_key,
                                                    region_value)
                self.assertTrue(bool(candidates))
                # Country-level entries are layered on top of regional ones.
                candidates.update(country_langs)

                got = disambiguate_language(text, candidates.items())

                self.assertEqual(
                    got, want,
                    message_template.format(got, want, text,
                                            candidates.items()))
Esempio n. 5
0
    def formatted_addresses(self,
                            country_dir,
                            path,
                            configs,
                            tag_components=True):
        """Yield formatted addresses built from the rows of a CSV file.

        Settings are looked up with ``self.get_property(name, *configs)``.
        Every CSV row with parseable ``LAT``/``LON`` columns is mapped to
        address components via the ``fields`` setting, cleaned, validated,
        augmented (country name, state, optional OSM boundaries and
        neighborhoods) and formatted.

        Args:
            country_dir: Country the file is filed under. Used as the country
                whenever the reverse-geocoded country is missing, or differs
                and ``override_country_dir`` is not set.
            path: Path of the CSV file to read. Its header row must contain
                ``LAT`` and ``LON`` columns (``ValueError`` otherwise).
            configs: Sequence of config objects, splatted into
                ``self.get_property``.
            tag_components: Forwarded to ``self.formatter.format_address``.

        Yields:
            ``(language, country, formatted)`` tuples. A row may produce up to
            three: the full address, an address-only variant (no places or
            postcode) and a place-only variant (address dropped).

        Nothing is yielded when no ``fields`` setting is configured or when
        the file has no header row.
        """
        abbreviate_street_prob = float(
            self.get_property('abbreviate_street_probability', *configs))
        separate_street_prob = float(
            self.get_property('separate_street_probability', *configs) or 0.0)
        abbreviate_unit_prob = float(
            self.get_property('abbreviate_unit_probability', *configs))
        separate_unit_prob = float(
            self.get_property('separate_unit_probability', *configs) or 0.0)
        abbreviate_toponym_prob = float(
            self.get_property('abbreviate_toponym_probability', *configs))

        add_osm_boundaries = bool(
            self.get_property('add_osm_boundaries', *configs) or False)
        add_osm_neighborhoods = bool(
            self.get_property('add_osm_neighborhoods', *configs) or False)
        osm_neighborhood_overrides_city = self.get_property(
            'osm_neighborhood_overrides_city', *configs)
        non_numeric_units = bool(
            self.get_property('non_numeric_units', *configs) or False)
        house_number_strip_commas = bool(
            self.get_property('house_number_strip_commas', *configs) or False)
        numeric_postcodes_only = bool(
            self.get_property('numeric_postcodes_only', *configs) or False)
        postcode_strip_non_digit_chars = bool(
            self.get_property('postcode_strip_non_digit_chars', *configs)
            or False)

        address_only_probability = float(
            self.get_property('address_only_probability', *configs))
        place_only_probability = float(
            self.get_property('place_only_probability', *configs))
        place_and_postcode_probability = float(
            self.get_property('place_and_postcode_probability', *configs))

        city_replacements = self.get_property('city_replacements', *configs)

        override_country_dir = self.get_property('override_country_dir',
                                                 *configs)

        postcode_length = int(
            self.get_property('postcode_length', *configs) or 0)

        drop_address_probability = place_only_probability + place_and_postcode_probability

        ignore_rows_missing_fields = set(
            self.get_property('ignore_rows_missing_fields', *configs) or [])

        # One case-insensitive alternation regex per field; a match means the
        # field value is discarded.
        ignore_fields_containing = {
            field: re.compile(
                six.u('|').join(
                    [six.u('(?:{})').format(safe_decode(v)) for v in value]),
                re.I | re.UNICODE)
            for field, value in six.iteritems(
                dict(
                    self.get_property('ignore_fields_containing', *configs)
                    or {}))
        }

        # Per field: (compiled pattern, alias config) pairs; a match may
        # re-label the value as a different component.
        alias_fields_containing = {
            field:
            [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
            for field, value in six.iteritems(
                dict(
                    self.get_property('alias_fields_containing', *configs)
                    or {}))
        }

        config_language = self.get_property('language', *configs)

        add_components = self.get_property('add', *configs)

        fields = self.get_property('fields', *configs)
        if not fields:
            return

        field_map = {
            field_name: f['component']
            for field_name, f in six.iteritems(fields)
        }
        mapped_values = {
            f['component']: f['value_map']
            for f in six.itervalues(fields)
            if hasattr(f.get('value_map'), 'get')
        }

        # The context manager closes the file when the generator is exhausted,
        # closed, or garbage-collected, instead of leaking the handle.
        with open(path) as f:
            reader = unicode_csv_reader(f)
            # next() works on Python 2 and 3; the default avoids a stray
            # StopIteration (RuntimeError on Python 3.7+) for an empty file.
            headers = next(reader, None)
            if headers is None:
                return

            header_indices = {
                i: field_map[k]
                for i, k in enumerate(headers) if k in field_map
            }
            latitude_index = headers.index('LAT')
            longitude_index = headers.index('LON')

            # Clear cached polygons
            self.components.osm_admin_rtree.clear_cache()
            self.components.neighborhoods_rtree.clear_cache()

            for row in reader:
                # Rows without usable coordinates are skipped.
                try:
                    latitude = float(row[latitude_index])
                    longitude = float(row[longitude_index])
                except (ValueError, TypeError):
                    continue

                language = config_language

                components = {}

                skip_record = False

                for i, key in six.iteritems(header_indices):
                    value = row[i].strip()
                    if not value and key in ignore_rows_missing_fields:
                        skip_record = True
                        break
                    elif not value:
                        continue

                    if key in mapped_values:
                        value = mapped_values[key].get(value, value)

                    if key == AddressFormatter.ROAD and language == SPANISH:
                        value = self.components.spanish_street_name(value)

                    if key == AddressFormatter.POSTCODE:
                        value = self.cleanup_number(value)

                        if postcode_strip_non_digit_chars:
                            value = six.u('').join(
                                (c for c in value if c.isdigit()))

                        if value and not is_numeric(
                                value) and numeric_postcodes_only:
                            continue
                        else:
                            # Left-pad with zeros, then truncate to the
                            # configured length.
                            # NOTE(review): a value emptied by the cleanup
                            # above becomes all zeros here - confirm intended.
                            if postcode_length:
                                value = value.zfill(
                                    postcode_length)[:postcode_length]

                    if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
                        # When OSM boundaries are added, the file's own
                        # boundary names are not used.
                        if add_osm_boundaries:
                            continue
                        value = self.components.cleaned_name(
                            value, first_comma_delimited_phrase=True)
                        # Drop numeric names and single-character names
                        # (unless the script is ideographic).
                        if value and ((len(value) < 2
                                       and get_string_script(value)[0].lower()
                                       not in ideographic_scripts)
                                      or is_numeric(value)):
                            continue

                    if not_applicable_regex.match(value) or null_regex.match(
                            value) or unknown_regex.match(value):
                        continue

                    # Field-specific replacements first, then the global ones
                    # registered under the None key.
                    for exp, sub_val in self.field_regex_replacements.get(
                            key, []):
                        value = exp.sub(sub_val, value)

                    for exp, sub_val in self.field_regex_replacements.get(
                            None, []):
                        value = exp.sub(sub_val, value)

                    value = value.strip(', -')

                    # Validator lookup order: country, then language, then
                    # the generic per-component validator.
                    validator = self.country_validators.get(
                        country_dir, {}).get(
                            key,
                            self.language_validators.get(language, {}).get(
                                key, self.component_validators.get(key, None)))

                    if validator is not None and not validator(value):
                        continue

                    if key in ignore_fields_containing and ignore_fields_containing[
                            key].search(value):
                        continue

                    for (pattern, alias) in alias_fields_containing.get(
                            key, []):
                        if pattern.search(value):
                            if 'component' in alias:
                                key = alias['component']

                    if value:
                        components[key] = value

                if skip_record:
                    continue

                if components:
                    country, candidate_languages = self.country_rtree.country_and_languages(
                        latitude, longitude)
                    if not (country and candidate_languages) or (
                            country != country_dir
                            and not override_country_dir):
                        country = country_dir
                        candidate_languages = get_country_languages(country)
                        if not candidate_languages:
                            continue
                        # Materialise as a list: it is indexed below, which a
                        # Python 3 dict view does not support.
                        candidate_languages = list(
                            candidate_languages.items())

                    components = self.fix_component_encodings(components)

                    if language is None:
                        language = AddressComponents.address_language(
                            components, candidate_languages)

                    street = components.get(AddressFormatter.ROAD, None)
                    if street is not None:
                        street = street.strip()
                        street = AddressComponents.cleaned_name(street)

                        if language == UNKNOWN_LANGUAGE:
                            strip_unit_language = candidate_languages[0][
                                0] if candidate_languages else None
                        else:
                            strip_unit_language = language

                        street = self.components.strip_unit_phrases_for_language(
                            street, strip_unit_language)

                        street = abbreviate(
                            street_types_gazetteer,
                            street,
                            language,
                            abbreviate_prob=abbreviate_street_prob,
                            separate_prob=separate_street_prob)
                        components[AddressFormatter.ROAD] = street

                    house_number = components.get(
                        AddressFormatter.HOUSE_NUMBER, None)
                    if house_number:
                        house_number = self.cleanup_number(
                            house_number,
                            strip_commas=house_number_strip_commas)

                        if language == CHINESE:
                            house_number = self.format_chinese_house_number(
                                house_number)

                        if country_dir == Countries.COLOMBIA:
                            house_number = self.format_colombian_house_number(
                                house_number)

                        if house_number is not None:
                            components[
                                AddressFormatter.HOUSE_NUMBER] = house_number

                    unit = components.get(AddressFormatter.UNIT, None)

                    street_required = country not in (
                        Countries.JAPAN, Countries.CZECH_REPUBLIC
                    ) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES

                    postcode = components.get(AddressFormatter.POSTCODE, None)

                    if postcode:
                        components[
                            AddressFormatter.
                            POSTCODE] = PostalCodes.add_country_code(
                                postcode, country)

                    # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                    if (not street and street_required) or (
                            street and house_number and
                        (street.lower() == house_number.lower())) or (
                            unit and street
                            and street.lower() == unit.lower()):
                        if not postcode:
                            continue
                        components = self.components.drop_address(components)

                    # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
                    unit = components.get(AddressFormatter.UNIT, None)

                    if unit is not None:
                        if is_numeric_strict(unit):
                            unit = Unit.phrase(unit, language, country=country)
                        elif non_numeric_units:
                            unit = abbreviate(
                                unit_types_gazetteer,
                                unit,
                                language,
                                abbreviate_prob=abbreviate_unit_prob,
                                separate_prob=separate_unit_prob)
                        else:
                            unit = None

                        if unit is not None:
                            components[AddressFormatter.UNIT] = unit
                        else:
                            components.pop(AddressFormatter.UNIT)
                            unit = None

                    # CLDR country name
                    country_name = self.cldr_country_name(
                        country, language, configs)
                    if country_name:
                        components[AddressFormatter.COUNTRY] = country_name

                    for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                        component = components.get(component_key, None)
                        if component is not None:
                            component = abbreviate(
                                toponym_abbreviations_gazetteer,
                                component,
                                language,
                                abbreviate_prob=abbreviate_toponym_prob)
                            component = self.components.name_hyphens(component)
                            components[component_key] = component

                    # Any components specified to be added by the config (usually state)
                    if add_components:
                        for k, v in six.iteritems(add_components):
                            if k not in components:
                                components[k] = v

                    # Get named states occasionally, added component is usually a state code
                    address_state = self.components.state_name(
                        components, country, language)
                    if address_state:
                        components[AddressFormatter.STATE] = address_state

                    state = components.get(AddressFormatter.STATE)
                    if state:
                        state = self.components.abbreviated_state(
                            state, country, language)
                        if state:
                            components[AddressFormatter.STATE] = state

                    # This is expensive, so only turn on for files that don't supply their own city names
                    # or for which those names are flawed
                    osm_components = []

                    # Using population=0 instead of None means if there's no known population or
                    # we don't need to add OSM components, we assume the population of the town is
                    # very small and the place name shouldn't be used unqualified (i.e. needs information
                    # like state name to disambiguate it)
                    population = 0
                    unambiguous_city = False
                    if add_osm_boundaries or AddressFormatter.CITY not in components:
                        osm_components = self.components.osm_reverse_geocoded_components(
                            latitude, longitude)
                        self.components.add_admin_boundaries(
                            components, osm_components, country, language,
                            latitude, longitude)
                        categorized = self.components.categorized_osm_components(
                            country, osm_components)
                        # Only the first component labelled as a city is
                        # consulted.
                        for component, label in categorized:
                            if label == AddressFormatter.CITY:
                                unambiguous_city = self.components.unambiguous_wikipedia(
                                    component, language)
                                if 'population' in component:
                                    population = component['population']
                                break

                    if AddressFormatter.CITY not in components and city_replacements:
                        components.update({
                            k: v
                            for k, v in six.iteritems(city_replacements)
                            if k not in components
                        })

                    # The neighborhood index is cheaper so can turn on for whole countries
                    neighborhood_components = []
                    if add_osm_neighborhoods:
                        neighborhood_components = self.components.neighborhood_components(
                            latitude, longitude)
                        self.components.add_neighborhoods(
                            components,
                            neighborhood_components,
                            country,
                            language,
                            replace_city=osm_neighborhood_overrides_city)

                    self.components.cleanup_boundary_names(components)
                    self.components.country_specific_cleanup(
                        components, country)

                    self.components.replace_name_affixes(components,
                                                         language,
                                                         country=country)

                    self.components.replace_names(components)

                    self.components.prune_duplicate_names(components)

                    self.components.remove_numeric_boundary_names(components)
                    self.components.add_house_number_phrase(components,
                                                            language,
                                                            country=country)
                    self.components.add_postcode_phrase(components,
                                                        language,
                                                        country=country)

                    # Component dropout
                    all_osm_components = osm_components + neighborhood_components
                    components = place_config.dropout_components(
                        components,
                        all_osm_components,
                        country=country,
                        population=population,
                        unambiguous_city=unambiguous_city)

                    self.components.add_genitives(components, language)

                    formatted = self.formatter.format_address(
                        components,
                        country,
                        language=language,
                        minimal_only=False,
                        tag_components=tag_components)
                    yield (language, country, formatted)

                    # Occasionally also emit the street-level part alone.
                    if random.random() < address_only_probability and street:
                        address_only_components = self.components.drop_places(
                            components)
                        address_only_components = self.components.drop_postcode(
                            address_only_components)
                        formatted = self.formatter.format_address(
                            address_only_components,
                            country,
                            language=language,
                            minimal_only=False,
                            tag_components=tag_components)
                        yield (language, country, formatted)

                    rand_val = random.random()

                    # Occasionally also emit the place-level part alone.
                    if street and house_number and rand_val < drop_address_probability:
                        components = self.components.drop_address(components)

                        # NOTE(review): the postcode is dropped in the band
                        # below place_and_postcode_probability and kept in the
                        # rest of the drop-address band - confirm this matches
                        # the intended meaning of the two probabilities.
                        if rand_val < place_and_postcode_probability:
                            components = self.components.drop_postcode(
                                components)

                        if components and (len(components) > 1
                                           or add_osm_boundaries):
                            formatted = self.formatter.format_address(
                                components,
                                country,
                                language=language,
                                minimal_only=False,
                                tag_components=tag_components)
                            yield (language, country, formatted)
Esempio n. 6
0
    def formatted_addresses(self, country_dir, path, configs, tag_components=True):
        abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
        separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
        abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
        separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
        abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs))

        add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
        add_osm_neighborhoods = bool(self.get_property('add_osm_neighborhoods', *configs) or False)
        osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs)
        non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False)
        house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False)
        numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
        postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)

        address_only_probability = float(self.get_property('address_only_probability', *configs))
        place_only_probability = float(self.get_property('place_only_probability', *configs))
        place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))

        city_replacements = self.get_property('city_replacements', *configs)

        override_country_dir = self.get_property('override_country_dir', *configs)

        postcode_length = int(self.get_property('postcode_length', *configs) or 0)

        drop_address_probability = place_only_probability + place_and_postcode_probability

        ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])

        ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE)
                                    for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))}

        alias_fields_containing = {field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
                                   for field, value in six.iteritems(dict(self.get_property('alias_fields_containing', *configs) or {}))}

        config_language = self.get_property('language', *configs)

        add_components = self.get_property('add', *configs)

        fields = self.get_property('fields', *configs)
        if not fields:
            return

        field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)}
        mapped_values = {f['component']: f['value_map'] for f in six.itervalues(fields) if hasattr(f.get('value_map'), 'get')}

        f = open(path)
        reader = unicode_csv_reader(f)
        headers = reader.next()

        header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
        latitude_index = headers.index('LAT')
        longitude_index = headers.index('LON')

        # Clear cached polygons
        self.components.osm_admin_rtree.clear_cache()
        self.components.neighborhoods_rtree.clear_cache()

        for row in reader:
            try:
                latitude = float(row[latitude_index])
                longitude = float(row[longitude_index])
            except (ValueError, TypeError):
                continue

            language = config_language

            components = {}

            skip_record = False

            for i, key in six.iteritems(header_indices):
                value = row[i].strip()
                if not value and key in ignore_rows_missing_fields:
                    skip_record = True
                    break
                elif not value:
                    continue

                if key in mapped_values:
                    value = mapped_values[key].get(value, value)

                if key == AddressFormatter.ROAD and language == SPANISH:
                    value = self.components.spanish_street_name(value)

                if key == AddressFormatter.POSTCODE:
                    value = self.cleanup_number(value)

                    if postcode_strip_non_digit_chars:
                        value = six.u('').join((c for c in value if c.isdigit()))

                    if value and not is_numeric(value) and numeric_postcodes_only:
                        continue
                    else:
                        if postcode_length:
                            value = value.zfill(postcode_length)[:postcode_length]

                if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
                    if add_osm_boundaries:
                        continue
                    value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
                    if value and ((len(value) < 2 and not get_string_script(value)[0].lower() in ideographic_scripts) or is_numeric(value)):
                        continue

                if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                    continue

                for exp, sub_val in self.field_regex_replacements.get(key, []):
                    value = exp.sub(sub_val, value)

                for exp, sub_val in self.field_regex_replacements.get(None, []):
                    value = exp.sub(sub_val, value)

                value = value.strip(', -')

                validator = self.country_validators.get(country_dir, {}).get(key, self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None)))

                if validator is not None and not validator(value):
                    continue

                if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
                    continue

                for (pattern, alias) in alias_fields_containing.get(key, []):
                    if pattern.search(value):
                        if 'component' in alias:
                            key = alias['component']

                if value:
                    components[key] = value

            if skip_record:
                continue

            if components:
                country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
                if not (country and candidate_languages) or (country != country_dir and not override_country_dir):
                    country = country_dir
                    candidate_languages = get_country_languages(country)
                    if not candidate_languages:
                        continue
                    candidate_languages = candidate_languages.items()

                components = self.fix_component_encodings(components)

                if language is None:
                    language = AddressComponents.address_language(components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)

                    if language == UNKNOWN_LANGUAGE:
                        strip_unit_language = candidate_languages[0][0] if candidate_languages else None
                    else:
                        strip_unit_language = language

                    street = self.components.strip_unit_phrases_for_language(street, strip_unit_language)

                    street = abbreviate(street_types_gazetteer, street, language,
                                        abbreviate_prob=abbreviate_street_prob,
                                        separate_prob=separate_street_prob)
                    components[AddressFormatter.ROAD] = street

                house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                if house_number:
                    house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)

                    if language == CHINESE:
                        house_number = self.format_chinese_house_number(house_number)

                    if country_dir == Countries.COLOMBIA:
                        house_number = self.format_colombian_house_number(house_number)

                    if house_number is not None:
                        components[AddressFormatter.HOUSE_NUMBER] = house_number

                unit = components.get(AddressFormatter.UNIT, None)

                street_required = country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES

                postcode = components.get(AddressFormatter.POSTCODE, None)

                if postcode:
                    components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if (not street and street_required) or (street and house_number and (street.lower() == house_number.lower())) or (unit and street and street.lower() == unit.lower()):
                    if not postcode:
                        continue
                    components = self.components.drop_address(components)

                # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
                unit = components.get(AddressFormatter.UNIT, None)

                if unit is not None:
                    if is_numeric_strict(unit):
                        unit = Unit.phrase(unit, language, country=country)
                    elif non_numeric_units:
                        unit = abbreviate(unit_types_gazetteer, unit, language,
                                          abbreviate_prob=abbreviate_unit_prob,
                                          separate_prob=separate_unit_prob)
                    else:
                        unit = None

                    if unit is not None:
                        components[AddressFormatter.UNIT] = unit
                    else:
                        components.pop(AddressFormatter.UNIT)
                        unit = None

                # CLDR country name
                country_name = self.cldr_country_name(country, language, configs)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer, component, language,
                                               abbreviate_prob=abbreviate_toponym_prob)
                        component = self.components.name_hyphens(component)
                        components[component_key] = component

                # Any components specified to be added by the config (usually state)
                if add_components:
                    for k, v in six.iteritems(add_components):
                        if k not in components:
                            components[k] = v

                # Get named states occasionally, added component is usually a state code
                address_state = self.components.state_name(components, country, language)
                if address_state:
                    components[AddressFormatter.STATE] = address_state

                state = components.get(AddressFormatter.STATE)
                if state:
                    state = self.components.abbreviated_state(state, country, language)
                    if state:
                        components[AddressFormatter.STATE] = state

                # This is expensive, so only turn on for files that don't supply their own city names
                # or for which those names are flawed
                osm_components = []

                # Using population=0 instead of None means if there's no known population or
                # we don't need to add OSM components, we assume the population of the town is
                # very small and the place name shouldn't be used unqualified (i.e. needs information
                # like state name to disambiguate it)
                population = 0
                unambiguous_city = False
                if add_osm_boundaries or AddressFormatter.CITY not in components:
                    osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
                    self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude)
                    categorized = self.components.categorized_osm_components(country, osm_components)
                    for component, label in categorized:
                        if label == AddressFormatter.CITY:
                            unambiguous_city = self.components.unambiguous_wikipedia(component, language)
                            if 'population' in component:
                                population = component['population']
                            break

                if AddressFormatter.CITY not in components and city_replacements:
                    components.update({k: v for k, v in six.iteritems(city_replacements) if k not in components})

                # The neighborhood index is cheaper so can turn on for whole countries
                neighborhood_components = []
                if add_osm_neighborhoods:
                    neighborhood_components = self.components.neighborhood_components(latitude, longitude)
                    self.components.add_neighborhoods(components, neighborhood_components, country, language, replace_city=osm_neighborhood_overrides_city)

                self.components.cleanup_boundary_names(components)
                self.components.country_specific_cleanup(components, country)

                self.components.replace_name_affixes(components, language, country=country)

                self.components.replace_names(components)

                self.components.prune_duplicate_names(components)

                self.components.remove_numeric_boundary_names(components)
                self.components.add_house_number_phrase(components, language, country=country)
                self.components.add_postcode_phrase(components, language, country=country)

                # Component dropout
                all_osm_components = osm_components + neighborhood_components
                components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)

                self.components.add_genitives(components, language)

                formatted = self.formatter.format_address(components, country, language=language,
                                                          minimal_only=False, tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < address_only_probability and street:
                    address_only_components = self.components.drop_places(components)
                    address_only_components = self.components.drop_postcode(address_only_components)
                    formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                rand_val = random.random()

                if street and house_number and rand_val < drop_address_probability:
                    components = self.components.drop_address(components)

                    if rand_val < place_and_postcode_probability:
                        components = self.components.drop_postcode(components)

                    if components and (len(components) > 1 or add_osm_boundaries):
                        formatted = self.formatter.format_address(components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)
    def formatted_addresses(self, path, tag_components=True):
        """Yield formatted United Kingdom addresses read from a CSV file.

        The first CSV row is treated as the header. Columns whose header
        appears in ``self.field_map`` are mapped to address component keys;
        all other columns are ignored. Values that are empty, match the
        not-applicable/null/unknown regexes, or are rejected by the
        per-component validator in ``self.component_validators`` are skipped.

        For every row with usable components the generator yields the full
        formatted address and, at random, up to two reduced variants: one
        produced by ``drop_places`` + ``drop_postcode`` (only when a street
        is present) and one produced by ``drop_address`` (optionally followed
        by ``drop_postcode``). An empty file yields nothing.

        :param path: path of the CSV file to read
        :param tag_components: passed through to ``self.formatter.format_address``
        :return: generator of ``(language, country, formatted)`` tuples
        """
        country = Countries.UNITED_KINGDOM
        candidate_languages = get_country_languages(country).items()

        # The context manager guarantees the file is closed when the generator
        # is exhausted, closed early, or garbage-collected.
        with open(path) as f:
            reader = unicode_csv_reader(f)

            # The next() builtin works on both Python 2 and 3 (reader.next() is
            # Python 2 only). The default avoids StopIteration escaping from the
            # generator body on an empty file.
            headers = next(reader, None)
            if headers is None:
                return

            # Column index -> component key, for the columns we know how to use
            header_indices = {i: self.field_map[k] for i, k in enumerate(headers) if k in self.field_map}

            for row in reader:
                components = {}

                for i, key in six.iteritems(header_indices):
                    value = row[i].strip()
                    if not value:
                        continue

                    if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
                        continue

                    value = value.strip(', -')

                    validator = self.component_validators.get(key, None)

                    if validator is not None and not validator(value):
                        continue

                    # Stripping punctuation above may have emptied the value
                    if value:
                        components[key] = value

                if not components:
                    continue

                components = self.fix_component_encodings(components)

                language = AddressComponents.address_language(components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)
                    if AddressComponents.street_name_is_valid(street):
                        street = abbreviate(street_types_gazetteer, street, language)
                        components[AddressFormatter.ROAD] = street
                    else:
                        components.pop(AddressFormatter.ROAD)
                        street = None

                house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
                if house_number:
                    house_number = self.cleanup_number(house_number, strip_commas=True)

                    # NOTE(review): if cleanup_number returns None the raw value
                    # stays in components -- confirm this is intended.
                    if house_number is not None:
                        components[AddressFormatter.HOUSE_NUMBER] = house_number

                postcode = components.get(AddressFormatter.POSTCODE, None)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if not street or (street and house_number and (street.lower() == house_number.lower())):
                    if not postcode:
                        continue
                    components = AddressComponents.drop_address(components)

                country_name = AddressComponents.cldr_country_name(country, language)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer, component, language)
                        component = AddressComponents.name_hyphens(component)
                        components[component_key] = component

                AddressComponents.replace_names(components)

                AddressComponents.prune_duplicate_names(components)

                AddressComponents.remove_numeric_boundary_names(components)
                AddressComponents.add_house_number_phrase(components, language, country=country)

                # Component dropout
                components = place_config.dropout_components(components, country=country)

                formatted = self.formatter.format_address(components, country, language=language,
                                                          minimal_only=False, tag_components=tag_components)
                yield (language, country, formatted)

                # Occasionally emit a variant without places and postcode
                if random.random() < self.address_only_probability and street:
                    address_only_components = AddressComponents.drop_places(components)
                    address_only_components = AddressComponents.drop_postcode(address_only_components)
                    formatted = self.formatter.format_address(address_only_components, country, language=language,
                                                              minimal_only=False, tag_components=tag_components)
                    yield (language, country, formatted)

                # A single draw is shared by both thresholds below, so the postcode
                # is only dropped when the address is dropped as well.
                rand_val = random.random()

                if street and house_number and rand_val < self.drop_address_probability:
                    components = AddressComponents.drop_address(components)

                    if rand_val < self.drop_address_and_postcode_probability:
                        components = AddressComponents.drop_postcode(components)

                    if components and (len(components) > 1):
                        formatted = self.formatter.format_address(components, country, language=language,
                                                                  minimal_only=False, tag_components=tag_components)
                        yield (language, country, formatted)
Esempio n. 8
0
    def formatted_addresses(self, path, tag_components=True):
        """Generate formatted United Kingdom addresses from a CSV file.

        The header row selects the columns to use: only headers present in
        ``self.field_map`` are kept, each mapped to an address component key.
        Cell values that are empty, match the not-applicable/null/unknown
        regexes, or are rejected by the matching entry of
        ``self.component_validators`` are ignored.

        Each row with usable components yields its full formatted address,
        plus up to two randomly chosen reduced variants: one built with
        ``drop_places`` and ``drop_postcode`` (requires a street), and one
        built with ``drop_address`` and possibly ``drop_postcode``. Nothing
        is yielded for an empty file.

        :param path: path of the CSV file to read
        :param tag_components: forwarded to ``self.formatter.format_address``
        :return: generator of ``(language, country, formatted)`` tuples
        """
        country = Countries.UNITED_KINGDOM
        candidate_languages = get_country_languages(country).items()

        # Use a context manager so the file handle is released when the
        # generator finishes, is closed early, or is garbage-collected.
        with open(path) as f:
            reader = unicode_csv_reader(f)

            # reader.next() only exists on Python 2; the next() builtin works
            # on both. Supplying a default keeps StopIteration from leaking
            # out of the generator body when the file is empty.
            headers = next(reader, None)
            if headers is None:
                return

            # Column index -> component key for the recognised columns
            header_indices = {
                i: self.field_map[k]
                for i, k in enumerate(headers) if k in self.field_map
            }

            for row in reader:
                components = {}

                for i, key in six.iteritems(header_indices):
                    value = row[i].strip()
                    if not value:
                        continue

                    if (not_applicable_regex.match(value) or
                            null_regex.match(value) or
                            unknown_regex.match(value)):
                        continue

                    value = value.strip(', -')

                    validator = self.component_validators.get(key, None)

                    if validator is not None and not validator(value):
                        continue

                    # The punctuation strip above can leave an empty string
                    if value:
                        components[key] = value

                if not components:
                    continue

                components = self.fix_component_encodings(components)

                language = AddressComponents.address_language(
                    components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)
                    if AddressComponents.street_name_is_valid(street):
                        street = abbreviate(street_types_gazetteer, street,
                                            language)
                        components[AddressFormatter.ROAD] = street
                    else:
                        components.pop(AddressFormatter.ROAD)
                        street = None

                house_number = components.get(AddressFormatter.HOUSE_NUMBER,
                                              None)
                if house_number:
                    house_number = self.cleanup_number(house_number,
                                                       strip_commas=True)

                    # NOTE(review): when cleanup_number returns None the raw
                    # value is left in components -- confirm this is intended.
                    if house_number is not None:
                        components[
                            AddressFormatter.HOUSE_NUMBER] = house_number

                postcode = components.get(AddressFormatter.POSTCODE, None)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if not street or (street and house_number and
                                  (street.lower() == house_number.lower())):
                    if not postcode:
                        continue
                    components = AddressComponents.drop_address(components)

                country_name = AddressComponents.cldr_country_name(
                    country, language)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(toponym_abbreviations_gazetteer,
                                               component, language)
                        component = AddressComponents.name_hyphens(component)
                        components[component_key] = component

                AddressComponents.replace_names(components)

                AddressComponents.prune_duplicate_names(components)

                AddressComponents.remove_numeric_boundary_names(components)
                AddressComponents.add_house_number_phrase(components,
                                                          language,
                                                          country=country)

                # Component dropout
                components = place_config.dropout_components(components,
                                                             country=country)

                formatted = self.formatter.format_address(
                    components,
                    country,
                    language=language,
                    minimal_only=False,
                    tag_components=tag_components)
                yield (language, country, formatted)

                # Occasionally emit a variant without places and postcode
                if random.random() < self.address_only_probability and street:
                    address_only_components = AddressComponents.drop_places(
                        components)
                    address_only_components = AddressComponents.drop_postcode(
                        address_only_components)
                    formatted = self.formatter.format_address(
                        address_only_components,
                        country,
                        language=language,
                        minimal_only=False,
                        tag_components=tag_components)
                    yield (language, country, formatted)

                # One draw is compared against both thresholds below, so the
                # postcode is only dropped together with the address.
                rand_val = random.random()

                if street and house_number and rand_val < self.drop_address_probability:
                    components = AddressComponents.drop_address(components)

                    if rand_val < self.drop_address_and_postcode_probability:
                        components = AddressComponents.drop_postcode(
                            components)

                    if components and (len(components) > 1):
                        formatted = self.formatter.format_address(
                            components,
                            country,
                            language=language,
                            minimal_only=False,
                            tag_components=tag_components)
                        yield (language, country, formatted)