Beispiel #1
0
    def name(self, country, language, component, name):
        '''
        Randomly rewrite a place name using configured regexes and affixes.

        First every regex replacement registered for the country (followed by
        the ones registered under the None key) is tried in order; when a
        regex matches and a random draw falls below its probability, the name
        is replaced by the configured match group.

        Then a prefix and, failing that, a suffix configured for the
        (language, component) pair may be attached. An affix is skipped when
        the corresponding regex already matches the name or when
        weighted_choice returns None. The first affix actually chosen is
        attached and the result returned immediately.

        Returns the input name unchanged when nothing is configured.
        '''
        lookup = (language, component)
        replacements = self.country_regex_replacements
        rules = replacements.get(country, []) + replacements.get(None, [])

        prefixes, prefix_probs = self.prefixes.get(lookup, (None, None))
        suffixes, suffix_probs = self.suffixes.get(lookup, (None, None))

        if not (rules or prefixes or suffixes):
            return name

        for pattern, group, prob in rules:
            found = pattern.match(name)
            if found and random.random() < prob:
                name = found.group(group)

        candidates = (
            (prefixes, prefix_probs, self.prefix_regexes, 'prefix', True),
            (suffixes, suffix_probs, self.suffix_regexes, 'suffix', False),
        )

        for options, option_probs, patterns, key, is_prefix in candidates:
            if options is None:
                continue

            # The name already carries an affix of this kind; try the next kind
            if patterns[lookup].match(name):
                continue

            chosen = weighted_choice(options, option_probs)
            if chosen is None:
                continue

            if chosen.get('whitespace', True):
                separator = six.u(' ')
            else:
                separator = six.u('')
            text = chosen[key]

            if is_prefix:
                parts = (text, separator, safe_decode(name))
            else:
                parts = (safe_decode(name), separator, text)
            return six.u('{}{}{}').format(*parts)

        return name
Beispiel #2
0
    def join(cls, phrases, language, country=None):
        '''
        Join several phrases with a connector picked from the address config.

        The connector and its properties come from weighted_choice over the
        alternatives configured for cls.key. The last max_phrase_join phrases
        (default 2) are joined with the connector, which is padded with a
        space on both sides when the 'whitespace' property is truthy. Any
        earlier phrases are joined, and followed, by the 'default_join'
        string.

        Raises ValueError when phrases has no __iter__ attribute.
        '''
        if not hasattr(phrases, '__iter__'):
            raise ValueError('Param phrases must be iterable')

        options, option_probs = address_config.alternative_probabilities(
            cls.key, language, country=country)
        connector, settings = weighted_choice(options, option_probs)

        spaced = settings.get('whitespace', True)
        items = [safe_decode(item) for item in phrases]
        tail_size = settings.get('max_phrase_join', 2)

        lead = six.u('')
        if len(items) > tail_size:
            if spaced:
                fallback = cls.DEFAULT_WHITESPACE_JOIN
            else:
                fallback = cls.DEFAULT_NON_WHITESPACE_JOIN
            separator = safe_decode(settings.get('default_join', fallback))
            # The trailing empty string makes the separator also follow the
            # last of the leading phrases
            lead = separator.join(items[:-tail_size] + [six.u('')])

        if spaced:
            pad = six.u(' ')
            connector = six.u('{}{}{}').format(pad, connector, pad)

        tail = connector.join(items[-tail_size:])
        return six.u('').join([lead, tail])
Beispiel #3
0
    def phrase(cls, unit, language, country=None, zone=None):
        '''
        Build the phrase for a unit, or a standalone unit phrase.

        With unit None, a phrase is picked from the 'units.standalone'
        alternatives and returned title-cased (None when the config yields no
        values).

        Otherwise the config key is 'units.alphanumeric', or
        'units.zones.<zone>' when a zone is given; None is returned when that
        property is falsy. The unit may get a direction added; if that did
        not change it, a quadrant may be added instead. The result is then
        handed to cls.numeric_phrase.
        '''
        if unit is None:
            options, option_probs = address_config.alternative_probabilities(
                'units.standalone', language,
                dictionaries=['unit_types_standalone'],
                country=country)
            if options is None:
                return None
            chosen, _ = weighted_choice(options, option_probs)
            return chosen.title()

        if zone is None:
            key = 'units.alphanumeric'
        else:
            key = 'units.zones.{}'.format(zone)

        if not address_config.get_property(key, language, country=country):
            return None

        is_alpha = safe_decode(unit).isalpha()

        with_direction = None
        if address_config.get_property('{}.add_direction'.format(key), language, country=country):
            with_direction = cls.add_direction(key, unit, language, country=country)

        if with_direction and with_direction != unit:
            unit = with_direction
            is_alpha = False
        elif address_config.get_property('{}.add_quadrant'.format(key), language, country=country):
            # Quadrants are only considered when no direction was applied
            unit = cls.add_quadrant(key, unit, language, country=country)
            is_alpha = False

        return cls.numeric_phrase(key, safe_decode(unit), language,
                                  dictionaries=['unit_types_numbered'],
                                  country=country, is_alpha=is_alpha)
Beispiel #4
0
    def random(cls, language, country=None):
        '''
        Generate a random block identifier for the given language/country.

        The kind of identifier comes from
        cls.choose_alphanumeric_type('blocks.alphanumeric', ...): purely
        numeric, a single letter, or a letter and number combined in either
        order, optionally separated by a space according to the
        'whitespace_probability' property. Returns None when no type is
        configured or the type is not one of those handled here.
        '''
        kind, kind_props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
        if kind is None:
            return None

        if kind == cls.NUMERIC:
            return safe_decode(weighted_choice(cls.block_range, cls.block_range_cdf))

        letters = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        native_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        # Fall back to the Latin alphabet when the draw misses the configured probability
        if native_probability is not None and random.random() >= native_probability:
            letters = latin_alphabet
        letter = sample_alphabet(letters, 2.0)

        if kind == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.block_range, cls.block_range_cdf)

        space_probability = float(kind_props.get('whitespace_probability', 0.0))
        gap = six.u('')
        if space_probability and random.random() < space_probability:
            gap = six.u(' ')

        if kind == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, gap, number)
        if kind == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, gap, letter)
        return None
Beispiel #5
0
    def join(cls, phrases, language, country=None):
        '''
        Join several phrases with a connector picked from the address config.

        The connector and its properties come from weighted_choice over the
        alternatives configured for cls.key. The last max_phrase_join phrases
        (default 2) are joined with the connector, which is padded with a
        space on both sides when the 'whitespace' property is truthy. Any
        earlier phrases are joined, and followed, by the 'default_join'
        string.

        Raises ValueError when phrases has no __iter__ attribute.
        '''
        if not hasattr(phrases, '__iter__'):
            raise ValueError('Param phrases must be iterable')

        options, option_probs = address_config.alternative_probabilities(cls.key, language, country=country)
        connector, settings = weighted_choice(options, option_probs)

        spaced = settings.get('whitespace', True)
        items = [safe_decode(item) for item in phrases]
        tail_size = settings.get('max_phrase_join', 2)

        lead = six.u('')
        if len(items) > tail_size:
            if spaced:
                fallback = cls.DEFAULT_WHITESPACE_JOIN
            else:
                fallback = cls.DEFAULT_NON_WHITESPACE_JOIN
            separator = safe_decode(settings.get('default_join', fallback))
            # The trailing empty string makes the separator also follow the
            # last of the leading phrases
            lead = separator.join(items[:-tail_size] + [six.u('')])

        if spaced:
            pad = six.u(' ')
            connector = six.u('{}{}{}').format(pad, connector, pad)

        tail = connector.join(items[-tail_size:])
        return six.u('').join([lead, tail])
Beispiel #6
0
def latlon_to_decimal(latitude, longitude):
    '''
    Convert a textual latitude/longitude pair to decimal degrees.

    Each coordinate is decoded, stripped of surrounding spaces, commas,
    semicolons and pipes, and has remaining commas replaced by decimal
    points. It is then matched against the degrees/minutes/seconds pattern,
    then the decimal-with-direction pattern; if neither matches, whatever
    beginning_re and end_re match is removed and the rest is parsed as a
    plain float.

    Returns a (latitude, longitude) tuple after passing both values through
    to_valid_latitude / to_valid_longitude.

    Raises ValueError when a value cannot be converted by float() or is
    rejected by is_valid_latitude / is_valid_longitude.
    '''
    latitude = safe_decode(latitude).strip(u' ,;|').replace(u',', u'.')
    longitude = safe_decode(longitude).strip(u' ,;|').replace(u',', u'.')

    lat_dms = latitude_dms_regex.match(latitude)
    lat_dir = latitude_decimal_with_direction_regex.match(latitude)

    if lat_dms:
        d, m, s, c = lat_dms.groups()
        sign = direction_sign(c)
        latitude = degrees_to_decimal(d or 0, m or 0, s or 0)
        # The hemisphere sign used to be computed and then discarded here, so
        # southern DMS latitudes came out positive.
        # NOTE(review): the None check assumes direction_sign may return None
        # when no direction letter was captured -- confirm.
        if sign is not None:
            latitude *= sign
    elif lat_dir:
        d, c = lat_dir.groups()
        sign = direction_sign(c)
        # NOTE(review): return_type is not defined in this function;
        # presumably a module-level callable such as float -- confirm.
        latitude = return_type(d) * sign
    else:
        latitude = re.sub(beginning_re, u'', latitude)
        latitude = re.sub(end_re, u'', latitude)

    lon_dms = longitude_dms_regex.match(longitude)
    lon_dir = longitude_decimal_with_direction_regex.match(longitude)

    if lon_dms:
        d, m, s, c = lon_dms.groups()
        sign = direction_sign(c)
        longitude = degrees_to_decimal(d or 0, m or 0, s or 0)
        # Same fix as for latitude: apply the hemisphere sign
        if sign is not None:
            longitude *= sign
    elif lon_dir:
        d, c = lon_dir.groups()
        sign = direction_sign(c)
        longitude = return_type(d) * sign
    else:
        longitude = re.sub(beginning_re, u'', longitude)
        longitude = re.sub(end_re, u'', longitude)

    latitude = float(latitude)
    longitude = float(longitude)

    if not is_valid_latitude(latitude):
        raise ValueError('Invalid latitude: {}'.format(latitude))

    if not is_valid_longitude(longitude):
        raise ValueError('Invalid longitude: {}'.format(longitude))

    latitude = to_valid_latitude(latitude)
    longitude = to_valid_longitude(longitude)

    return latitude, longitude
Beispiel #7
0
def latlon_to_decimal(latitude, longitude):
    '''
    Convert a textual latitude/longitude pair to decimal degrees.

    Each coordinate is decoded, stripped of surrounding spaces, commas,
    semicolons and pipes, and has remaining commas replaced by decimal
    points. It is then matched against the degrees/minutes/seconds pattern,
    then the decimal-with-direction pattern; if neither matches, whatever
    beginning_re and end_re match is removed and the rest is parsed as a
    plain float.

    Returns a (latitude, longitude) tuple after passing both values through
    to_valid_latitude / to_valid_longitude.

    Raises ValueError when a value cannot be converted by float() or is
    rejected by is_valid_latitude / is_valid_longitude.
    '''
    latitude = safe_decode(latitude).strip(u' ,;|').replace(u',', u'.')
    longitude = safe_decode(longitude).strip(u' ,;|').replace(u',', u'.')

    lat_dms = latitude_dms_regex.match(latitude)
    lat_dir = latitude_decimal_with_direction_regex.match(latitude)

    if lat_dms:
        d, m, s, c = lat_dms.groups()
        sign = direction_sign(c)
        latitude = degrees_to_decimal(d or 0, m or 0, s or 0)
        # The hemisphere sign used to be computed and then discarded here, so
        # southern DMS latitudes came out positive.
        # NOTE(review): the None check assumes direction_sign may return None
        # when no direction letter was captured -- confirm.
        if sign is not None:
            latitude *= sign
    elif lat_dir:
        d, c = lat_dir.groups()
        sign = direction_sign(c)
        # NOTE(review): return_type is not defined in this function;
        # presumably a module-level callable such as float -- confirm.
        latitude = return_type(d) * sign
    else:
        latitude = re.sub(beginning_re, u'', latitude)
        latitude = re.sub(end_re, u'', latitude)

    lon_dms = longitude_dms_regex.match(longitude)
    lon_dir = longitude_decimal_with_direction_regex.match(longitude)

    if lon_dms:
        d, m, s, c = lon_dms.groups()
        sign = direction_sign(c)
        longitude = degrees_to_decimal(d or 0, m or 0, s or 0)
        # Same fix as for latitude: apply the hemisphere sign
        if sign is not None:
            longitude *= sign
    elif lon_dir:
        d, c = lon_dir.groups()
        sign = direction_sign(c)
        longitude = return_type(d) * sign
    else:
        longitude = re.sub(beginning_re, u'', longitude)
        longitude = re.sub(end_re, u'', longitude)

    latitude = float(latitude)
    longitude = float(longitude)

    if not is_valid_latitude(latitude):
        raise ValueError('Invalid latitude: {}'.format(latitude))

    if not is_valid_longitude(longitude):
        raise ValueError('Invalid longitude: {}'.format(longitude))

    latitude = to_valid_latitude(latitude)
    longitude = to_valid_longitude(longitude)

    return latitude, longitude
Beispiel #8
0
    def phrase(cls, number, language, country=None):
        '''
        Build the phrase for a conscription number.

        Returns None when number is None. Otherwise the decoded number is
        passed to cls.numeric_phrase under the
        'conscription_numbers.alphanumeric' key with the 'house_numbers'
        dictionary.
        '''
        if number is None:
            return None

        # The previously computed but unused `default` local (a second
        # safe_decode of the same value) has been dropped.
        return cls.numeric_phrase('conscription_numbers.alphanumeric',
                                  safe_decode(number), language,
                                  dictionaries=['house_numbers'],
                                  country=country)
Beispiel #9
0
    def random(cls, language, country=None):
        '''
        Generate a random staircase identifier for the given language/country.

        The kind of identifier comes from
        cls.choose_alphanumeric_type('staircases.alphanumeric', ...): a
        number, a hyphenated pair of numbers, a single letter, or a letter
        and number combined in either order. In the combined case the two
        parts are separated by a space, a hyphen or nothing, decided by one
        random draw against the 'whitespace_probability' and
        'hyphen_probability' properties. Returns None when no type is
        configured or the type is not one of those handled here.
        '''
        kind, kind_props = cls.choose_alphanumeric_type(
            'staircases.alphanumeric', language, country=country)
        if kind is None:
            return None

        if kind == cls.NUMERIC:
            return safe_decode(
                weighted_choice(cls.staircase_range, cls.staircase_range_cdf))

        if kind == cls.HYPHENATED_NUMBER:
            first = weighted_choice(cls.staircase_range,
                                    cls.staircase_range_cdf)
            second = first + weighted_choice(cls.staircase_range,
                                             cls.staircase_range_cdf)
            return u'{}-{}'.format(first, second)

        letters = address_config.get_property('alphabet', language,
                                              country=country,
                                              default=latin_alphabet)
        native_probability = address_config.get_property(
            'alphabet_probability', language, country=country, default=None)
        # Fall back to the Latin alphabet when the draw misses the configured probability
        if native_probability is not None and random.random() >= native_probability:
            letters = latin_alphabet
        letter = sample_alphabet(letters, 2.0)

        if kind == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)

        space_probability = float(kind_props.get('whitespace_probability', 0.0))
        hyphen_probability = float(kind_props.get('hyphen_probability', 0.0))

        # A single draw selects between space, hyphen and no separator
        draw = random.random()
        if draw < space_probability:
            gap = u' '
        elif draw < (space_probability + hyphen_probability):
            gap = u'-'
        else:
            gap = u''

        if kind == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, gap, number)
        if kind == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, gap, letter)
        return None
    def create_from_osm_file(cls, filename, output_dir, precision=None):
        '''
        Build an index of points from the elements of an OSM file.

        Given an OSM file (planet or some other bounds) containing relations
        and their dependencies, create an R-tree index for coarse-grained
        reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other bounds.

        Elements without both 'lat' and 'lon' are skipped. Only the
        properties named in cls.include_property_patterns (directly, or via
        a '<prefix>:*' pattern) are stored, together with 'type' and 'id'.
        precision defaults to cls.GEOHASH_PRECISION. Returns the index.
        '''
        if precision is None:
            precision = cls.GEOHASH_PRECISION

        index = cls(save_dir=output_dir, precision=precision)

        def is_indexed(tag):
            # Kept when named explicitly or covered by a "<prefix>:*" pattern
            if tag in ('id', 'type'):
                return True
            if tag in cls.include_property_patterns:
                return True
            if six.u(':') not in tag:
                return False
            wildcard = six.u('{}:*').format(tag.split(six.u(':'), 1)[0])
            return wildcard in cls.include_property_patterns

        added = 0
        for element_id, tags, deps in parse_osm(filename):
            tags = dict((safe_decode(k), safe_decode(v))
                        for k, v in six.iteritems(tags))

            node_id = long(element_id.split(':')[-1])

            lat, lon = tags.get('lat'), tags.get('lon')
            if lat is None or lon is None:
                continue

            lat, lon = latlon_to_decimal(lat, lon)
            if lat is None or lon is None:
                continue

            # Nudge a longitude of (approximately) 180 just inside the range
            if isclose(lon, 180.0):
                lon = 179.999

            kept = {}
            for k, v in six.iteritems(tags):
                if is_indexed(k):
                    kept[k] = v
            kept['type'] = 'node'
            kept['id'] = node_id

            index.add_point(lat, lon, kept)

            # Progress report, based on the number of points added before this one
            if added > 0 and added % 1000 == 0:
                print('did {} points'.format(added))
            added += 1

        return index
Beispiel #11
0
def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
    '''
    Expand a string of numbers and ranges into a list of individual values.

    The value is normalized, split with number_split_regex, and each piece
    is handled on its own:

    - a numeric range whose end exceeds its start is expanded to every
      integer in between (at most max_range past the start), zero-padded to
      the width of the start when the start has a leading zero;
    - a numeric range that does not increase, or whose bounds are not valid
      integers, is kept as the original piece;
    - a letter range (when parse_letter_range is true) is expanded by code
      point in the same way, or reduced to its two endpoints when it does
      not increase;
    - anything else is kept as is.
    '''
    options = NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE
    normalized = normalize_string(value, string_options=options)

    results = []
    for piece in number_split_regex.split(normalized):
        piece = piece.strip()

        numeric = number_range_regex.match(piece)
        if numeric:
            low_text, high_text = numeric.groups()
            width = len(low_text)
            pad = width if low_text.startswith('0') else 0

            try:
                low = int(low_text)
                high = int(high_text)

                if high > low:
                    # Cap the expansion at max_range values past the start
                    high = min(high, low + max_range)
                    for n in xrange(low, high + 1):
                        results.append(safe_decode(n).zfill(pad))
                else:
                    results.append(piece.strip().zfill(pad))
            except (TypeError, ValueError):
                results.append(safe_decode(piece).strip().zfill(pad))
            continue

        letters = letter_range_regex.match(piece)
        if not (letters and parse_letter_range):
            results.append(safe_decode(piece.strip()))
            continue

        first_char, last_char = letters.groups()
        first = ord(first_char)
        last = ord(last_char)
        if last > first:
            last = min(last, first + max_range)
            for code in xrange(first, last + 1):
                results.append(six.unichr(code))
        else:
            results.extend([six.unichr(first), six.unichr(last)])

    return results
Beispiel #12
0
    def random_from_int(cls, number, language, country=None):
        '''
        Render an integer level number in a randomly chosen style.

        The style comes from
        cls.choose_alphanumeric_type('levels.alphanumeric', ...). Non-negative
        numbers are first shifted by the 'levels.numbering_starts_at'
        property. Depending on the style the result is the number, its Roman
        numeral (falling back to the plain number when none is available), a
        hyphenated pair, a letter, or a letter combined with a value drawn
        from cls.floors_letters. Returns None when no style is configured or
        the style is not handled here.
        '''
        kind, kind_props = cls.choose_alphanumeric_type(
            'levels.alphanumeric', language, country=country)
        if kind is None:
            return None

        offset = int(address_config.get_property('levels.numbering_starts_at',
                                                 language,
                                                 country=country,
                                                 default=0))
        if number >= 0:
            number += offset

        if kind == cls.NUMERIC:
            return safe_decode(number)

        if kind == cls.ROMAN_NUMERAL:
            roman = numeric_expressions.roman_numeral(number)
            if roman is None:
                return safe_decode(number)
            return roman

        if kind == cls.HYPHENATED_NUMBER:
            upper = number + sample_floors_range(1, cls.max_floors)
            return u'{}-{}'.format(number, upper)

        letters = address_config.get_property('alphabet', language,
                                              country=country,
                                              default=latin_alphabet)
        native_probability = address_config.get_property(
            'alphabet_probability', language, country=country, default=None)
        # Fall back to the Latin alphabet when the draw misses the configured probability
        if native_probability is not None and random.random() >= native_probability:
            letters = latin_alphabet
        letter = sample_alphabet(letters)

        if kind == cls.ALPHA:
            return letter

        # For combined styles the numeric part is drawn fresh rather than
        # taken from the argument
        number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)

        if kind == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}').format(letter, number)
        if kind == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}').format(number, letter)

        return None
Beispiel #13
0
    def phrase(cls, unit, language, country=None, zone=None):
        '''
        Build the phrase for a unit, or a standalone unit phrase.

        With unit None, a phrase is picked from the 'units.standalone'
        alternatives and returned title-cased (None when the config yields no
        values).

        Otherwise the config key is 'units.alphanumeric', or
        'units.zones.<zone>' when a zone is given; None is returned when that
        property is falsy. The unit may get a direction added; if that did
        not change it, a quadrant may be added instead. The result is then
        handed to cls.numeric_phrase.
        '''
        if unit is None:
            options, option_probs = address_config.alternative_probabilities(
                'units.standalone',
                language,
                dictionaries=['unit_types_standalone'],
                country=country)
            if options is None:
                return None
            chosen, _ = weighted_choice(options, option_probs)
            return chosen.title()

        if zone is None:
            key = 'units.alphanumeric'
        else:
            key = 'units.zones.{}'.format(zone)

        if not address_config.get_property(key, language, country=country):
            return None

        is_alpha = safe_decode(unit).isalpha()

        with_direction = None
        if address_config.get_property('{}.add_direction'.format(key),
                                       language, country=country):
            with_direction = cls.add_direction(key, unit, language,
                                               country=country)

        if with_direction and with_direction != unit:
            unit = with_direction
            is_alpha = False
        elif address_config.get_property('{}.add_quadrant'.format(key),
                                         language, country=country):
            # Quadrants are only considered when no direction was applied
            unit = cls.add_quadrant(key, unit, language, country=country)
            is_alpha = False

        return cls.numeric_phrase(key,
                                  safe_decode(unit),
                                  language,
                                  dictionaries=['unit_types_numbered'],
                                  country=country,
                                  is_alpha=is_alpha)
Beispiel #14
0
def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
    '''
    Expand a string of numbers and ranges into a list of individual values.

    The value is normalized, split with number_split_regex, and each piece
    is handled on its own:

    - a numeric range whose end exceeds its start is expanded to every
      integer in between (at most max_range past the start), zero-padded to
      the width of the start when the start has a leading zero;
    - a numeric range that does not increase, or whose bounds are not valid
      integers, is kept as the original piece;
    - a letter range (when parse_letter_range is true) is expanded by code
      point in the same way, or reduced to its two endpoints when it does
      not increase;
    - anything else is kept as is.
    '''
    options = NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE
    normalized = normalize_string(value, string_options=options)

    results = []
    for piece in number_split_regex.split(normalized):
        piece = piece.strip()

        numeric = number_range_regex.match(piece)
        if numeric:
            low_text, high_text = numeric.groups()
            width = len(low_text)
            pad = width if low_text.startswith('0') else 0

            try:
                low = int(low_text)
                high = int(high_text)

                if high > low:
                    # Cap the expansion at max_range values past the start
                    high = min(high, low + max_range)
                    for n in xrange(low, high + 1):
                        results.append(safe_decode(n).zfill(pad))
                else:
                    results.append(piece.strip().zfill(pad))
            except (TypeError, ValueError):
                results.append(safe_decode(piece).strip().zfill(pad))
            continue

        letters = letter_range_regex.match(piece)
        if not (letters and parse_letter_range):
            results.append(safe_decode(piece.strip()))
            continue

        first_char, last_char = letters.groups()
        first = ord(first_char)
        last = ord(last_char)
        if last > first:
            last = min(last, first + max_range)
            for code in xrange(first, last + 1):
                results.append(six.unichr(code))
        else:
            results.extend([six.unichr(first), six.unichr(last)])

    return results
Beispiel #15
0
    def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
        '''
        Combine a phrase with a number according to the phrase's properties.

        For cls.NUMERIC_AFFIX the phrase is replaced by props['affix'] and an
        all-digit number is right-justified to props['zero_pad'] (when
        present) using props['zero_char'] (default '0').

        props['direction'] ('left' or 'right') says on which side of the
        number the phrase goes; any other value yields the bare decoded
        number. The two parts are separated by a space when 'whitespace' is
        truthy, or, if 'whitespace_probability' is given, when a random draw
        falls below it. The phrase is title-cased unless 'title_case' is
        falsy.

        Returns the phrase alone when number is None.
        Raises KeyError when props has no 'direction' entry (or no 'affix'
        entry for a numeric affix).
        '''
        if num_type == cls.NUMERIC_AFFIX:
            phrase = props['affix']
            # number may legitimately be None (handled below), so it must be
            # checked before calling a string method on it
            if 'zero_pad' in props and number is not None and number.isdigit():
                number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)
        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        if props.get('title_case', True):
            # Title case unless the config specifies otherwise
            phrase = phrase.title()

        if number is None:
            return phrase

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(number)
Beispiel #16
0
 def for_floor(cls, floor_number, num_digits=None):
     '''
     Build a unit number that starts with the given floor number.

     A unit is drawn via weighted_choice from cls.positive_units_floors,
     zero-padded to num_digits (sampled with cls.sample_num_digits() when
     not given) and appended to floor_number.
     '''
     if num_digits is None:
         num_digits = cls.sample_num_digits()

     drawn = weighted_choice(cls.positive_units_floors,
                             cls.positive_units_floors_cdf)
     padded = safe_decode(drawn).zfill(num_digits)
     return six.u('{}{}').format(floor_number, padded)
Beispiel #17
0
    def pick_phrase_and_type(cls, number, language, country=None):
        '''
        Pick a phrase for cls.key and decide how it combines with a number.

        Returns a (num_type, phrase, props) tuple where props is the
        phrase's property dict for the chosen num_type. The type is drawn
        between cls.NUMERIC and cls.NUMERIC_AFFIX using the phrase's
        '<type>_probability' properties, defaulting to cls.NUMERIC when
        neither is set.

        When the config has no alternatives the result is
        (None, decoded number or None, None).
        '''
        options, option_probs = address_config.alternative_probabilities(
            cls.key, language, dictionaries=cls.dictionaries, country=country)
        if not options:
            decoded = None if number is None else safe_decode(number)
            return None, decoded, None

        phrase, phrase_props = weighted_choice(options, option_probs)

        # Collect the number types that have a configured probability
        weighted = []
        for kind in (cls.NUMERIC, cls.NUMERIC_AFFIX):
            weight = phrase_props.get('{}_probability'.format(kind), None)
            if weight is not None:
                weighted.append((kind, weight))

        if weighted:
            kinds = [kind for kind, _ in weighted]
            weights = [weight for _, weight in weighted]
            num_type = weighted_choice(kinds, cdf(weights))
        else:
            num_type = cls.NUMERIC

        return num_type, phrase, phrase_props[num_type]
Beispiel #18
0
def normalized_tokens(s,
                      string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True,
                      whitespace=False):
    '''
    Normalize a string, tokenize it, and normalize each token using the
    given string-level and token-level options.

    Only libpostal's deterministic normalizations (methods with a single
    output) are used here. The string tree version will return multiple
    normalized strings, each with tokens.

    When strip_parentheticals is true the tokens are passed through
    remove_parens. Returns a list of (token, token type) pairs, with the
    type looked up via token_types.from_id.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    text = safe_decode(s)
    tokens = _normalize.normalized_tokens(text, string_options,
                                          token_options, whitespace)

    if strip_parentheticals:
        tokens = remove_parens(tokens)

    return [(token, token_types.from_id(type_id))
            for token, type_id in tokens]
Beispiel #19
0
    def remove_components(self, template, tags):
        '''
        Return the template with the given tags removed.

        The template is parsed with pystache and walked element by element:

        - a section (an element with a `parsed` attribute) is rebuilt via
          self.build_first_of_template from its keys that are not in tags,
          and dropped when none remain;
        - a plain variable is re-emitted as a triple-brace tag unless its
          key is in tags;
        - any other element (literal text) is kept, except for the one that
          directly follows a dropped element.

        The pieces are concatenated and stripped of surrounding whitespace.
        '''
        excluded = set(tags)
        tree = pystache.parse(safe_decode(template))._parse_tree

        pieces = []
        skip_literal = False
        for node in tree:
            if hasattr(node, 'parsed'):
                remaining = [
                    child.key for child in node.parsed._parse_tree
                    if hasattr(child, 'key') and child.key not in excluded
                ]
                kept = bool(remaining)
                if kept:
                    pieces.append(self.build_first_of_template(remaining))
            elif hasattr(node, 'key'):
                kept = node.key not in excluded
                if kept:
                    pieces.append('{{{{{{{key}}}}}}}'.format(key=node.key))
            else:
                # Literal text: dropped only when the previous element was removed
                if not skip_literal:
                    pieces.append(node)
                kept = True

            skip_literal = not kept

        return ''.join(pieces).strip()
Beispiel #20
0
 def search_suffix(self, token):
     '''
     Look up the token by its reversed form via self.search_substring.

     Returns the first search result split on '|', or None when the search
     result is empty.
     '''
     reversed_token = safe_decode(token[::-1])
     matches, _ = self.search_substring(reversed_token)
     if not matches:
         return None
     return matches[0].split('|')
Beispiel #21
0
    def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
        '''
        Compile a lexicon of (regex, response) pairs into one pattern.

        Every regex is wrapped in its own capturing group and the groups are
        joined with '|' into a single compiled pattern stored on self.regex;
        the responses are kept, in the same order, on self.responses.
        '''
        self.lexicon = lexicon

        patterns, responses = zip(*lexicon)

        groups = [u'({})'.format(safe_decode(pattern)) for pattern in patterns]
        self.regex = re.compile(u'|'.join(groups), flags)
        self.responses = responses
Beispiel #22
0
    def remove_components(self, template, tags):
        '''
        Return the template with the given tags removed.

        The template is parsed with pystache and walked element by element:

        - a section (an element with a `parsed` attribute) is rebuilt via
          self.build_first_of_template from its keys that are not in tags,
          and dropped when none remain;
        - a plain variable is re-emitted as a triple-brace tag unless its
          key is in tags;
        - any other element (literal text) is kept, except for the one that
          directly follows a dropped element.

        The pieces are concatenated and stripped of surrounding whitespace.
        '''
        excluded = set(tags)
        tree = pystache.parse(safe_decode(template))._parse_tree

        pieces = []
        skip_literal = False
        for node in tree:
            if hasattr(node, 'parsed'):
                remaining = [child.key for child in node.parsed._parse_tree
                             if hasattr(child, 'key') and child.key not in excluded]
                kept = bool(remaining)
                if kept:
                    pieces.append(self.build_first_of_template(remaining))
            elif hasattr(node, 'key'):
                kept = node.key not in excluded
                if kept:
                    pieces.append('{{{{{{{key}}}}}}}'.format(key=node.key))
            else:
                # Literal text: dropped only when the previous element was removed
                if not skip_literal:
                    pieces.append(node)
                kept = True

            skip_literal = not kept

        return ''.join(pieces).strip()
Beispiel #23
0
def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
    """Normalize ``s`` with the ``_normalize`` extension.

    The Latin normalizer is used when ``string_options`` has the
    ``NORMALIZE_STRING_LATIN_ASCII`` bit set, the UTF-8 one otherwise.
    """
    decoded = safe_decode(s)
    wants_latin_ascii = string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII
    normalizer = (_normalize.normalize_string_latin if wants_latin_ascii
                  else _normalize.normalize_string_utf8)
    return normalizer(decoded, string_options)
Beispiel #24
0
    def cldr_country_names(self, language):
        '''
        Country names are tricky as there can be several versions
        and levels of verbosity e.g. United States of America
        vs. the more commonly used United States. Most countries
        have a similarly verbose form.

        The CLDR repo (http://cldr.unicode.org/) has the most
        comprehensive localized database of country names
        (among other things), organized by language. This function
        parses CLDR XML for a given language and returns a dictionary
        of {country_code: name} for that language.

        The XML file is read from ``<self.base_dir>/<language>.xml``.
        Territories listed in IGNORE_COUNTRIES or with all-digit codes
        are skipped. LANGUAGE_COUNTRY_OVERRIDES takes precedence over
        the CLDR names.
        '''
        filename = os.path.join(self.base_dir, '{}.xml'.format(language))
        # Close the file handle as soon as parsing is done instead of
        # leaving it to the garbage collector.
        with open(filename) as xml_file:
            xml = etree.parse(xml_file)

        # country_code -> {alt attribute (None for the default form): name}
        country_names = defaultdict(dict)

        for territory in xml.xpath('*//territories/*'):
            country_code = territory.attrib['type']

            if country_code in IGNORE_COUNTRIES or country_code.isdigit():
                continue

            country_names[country_code][territory.attrib.get(
                'alt')] = safe_decode(territory.text)

        display_names = {}

        # .items() works on both Python 2 and 3 (.iteritems() is 2-only)
        for country_code, names in country_names.items():
            if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
                display_names[country_code] = safe_decode(
                    LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
                continue

            default_name = names.get(None)

            if country_code in COUNTRY_USE_SHORT_NAME:
                display_names[country_code] = names.get('short', default_name)
            elif country_code in COUNTRY_USE_VARIANT_NAME:
                display_names[country_code] = names.get(
                    'variant', default_name)
            elif default_name is not None:
                display_names[country_code] = default_name

        return display_names
Beispiel #25
0
    def phrase(cls, number, language, country=None):
        """Randomly wrap a house number in a configured phrase.

        With a probability read from ``address_config`` the result of
        ``cls.numeric_phrase`` is returned; otherwise the decoded number
        (or ``None`` when no number was given) is returned.
        """
        if number is None:
            fallback = None
            dictionaries = ['no_number']
            key = 'house_numbers.no_number'
            prob_key = 'house_numbers.no_number_probability'
        else:
            fallback = safe_decode(number)
            dictionaries = ['house_numbers', 'number']
            key = 'house_numbers.alphanumeric'
            prob_key = 'house_numbers.alphanumeric_phrase_probability'

        phrase_prob = address_config.get_property(prob_key, language,
                                                  country=country, default=0.0)
        if not (random.random() < phrase_prob):
            return fallback

        return cls.numeric_phrase(key, safe_decode(number), language,
                                  dictionaries=dictionaries, country=country)
Beispiel #26
0
 def phrase(cls, box_number, language, country=None):
     """Build a PO box phrase for ``box_number`` via ``cls.numeric_phrase``.

     Returns ``None`` when no box number is given.
     """
     if box_number is not None:
         decoded_number = safe_decode(box_number)
         return cls.numeric_phrase('po_boxes.alphanumeric', decoded_number,
                                   language, dictionaries=['post_office'],
                                   country=country)
     return None
def download_pre_release_downloads(out_dir):
    """Download and unzip every configured pre-release URL into ``out_dir``.

    URLs come from the ``pre_release_downloads`` entry of
    ``openaddresses_config.config``.

    :return: ``False`` as soon as one download fails, ``True`` otherwise.
    """
    for url in openaddresses_config.config.get('pre_release_downloads', []):
        print(six.u('doing pre_release {}').format(safe_decode(url)))

        success = download_and_unzip_file(url, out_dir)
        if not success:
            # Report the URL that failed; the previous code referenced an
            # undefined name here and raised NameError instead of logging.
            print(six.u('ERR: could not download {}').format(safe_decode(url)))
            return False
    return True
def download_pre_release_downloads(out_dir):
    """Fetch all pre-release archives listed in the OpenAddresses config.

    Each URL under ``pre_release_downloads`` in
    ``openaddresses_config.config`` is passed to
    ``download_and_unzip_file`` together with ``out_dir``.

    :return: ``True`` when every download succeeded, ``False`` on the
        first failure.
    """
    for url in openaddresses_config.config.get('pre_release_downloads', []):
        print(six.u('doing pre_release {}').format(safe_decode(url)))

        success = download_and_unzip_file(url, out_dir)
        if not success:
            # The failing item is `url`; no `source` variable exists in
            # this function, so formatting it raised NameError.
            print(six.u('ERR: could not download {}').format(safe_decode(url)))
            return False
    return True
Beispiel #29
0
def cldr_country_names(language, base_dir=CLDR_MAIN_PATH):
    """
    Country names are tricky as there can be several versions
    and levels of verbosity e.g. United States of America
    vs. the more commonly used United States. Most countries
    have a similarly verbose form.

    The CLDR repo (http://cldr.unicode.org/) has the most
    comprehensive localized database of country names
    (among other things), organized by language. This function
    parses CLDR XML for a given language and returns a dictionary
    of {country_code: name} for that language.

    The XML file is read from ``<base_dir>/<language>.xml``.
    Territories listed in IGNORE_COUNTRIES or with all-digit codes
    are skipped. LANGUAGE_COUNTRY_OVERRIDES takes precedence over
    the CLDR names.
    """
    filename = os.path.join(base_dir, "{}.xml".format(language))
    # Close the file handle as soon as parsing is done instead of
    # leaving it to the garbage collector.
    with open(filename) as xml_file:
        xml = etree.parse(xml_file)

    # country_code -> {alt attribute (None for the default form): name}
    country_names = defaultdict(dict)

    for territory in xml.xpath("*//territories/*"):
        country_code = territory.attrib["type"]

        if country_code in IGNORE_COUNTRIES or country_code.isdigit():
            continue

        country_names[country_code][territory.attrib.get("alt")] = safe_decode(territory.text)

    display_names = {}

    # .items() works on both Python 2 and 3 (.iteritems() is 2-only)
    for country_code, names in country_names.items():
        if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
            display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
            continue

        default_name = names.get(None)

        if country_code in COUNTRY_USE_SHORT_NAME:
            display_names[country_code] = names.get("short", default_name)
        elif country_code in COUNTRY_USE_VARIANT_NAME:
            display_names[country_code] = names.get("variant", default_name)
        elif default_name is not None:
            display_names[country_code] = default_name

    return display_names
Beispiel #30
0
def normalize_wikipedia_title(title):
    """Clean up a Wikipedia title.

    Keeps group 1 of ``apposition_regex`` when it matches, then decodes,
    HTML-unescapes and URL-unquotes the title, and finally replaces
    underscores with spaces and strips surrounding whitespace.
    """
    apposition = apposition_regex.match(title)
    if apposition:
        title = apposition.group(1)

    cleaned = urllib.unquote_plus(html_parser.unescape(safe_decode(title)))
    return cleaned.replace(u'_', u' ').strip()
Beispiel #31
0
def normalize_wikipedia_title(title):
    """Return a display form of a raw Wikipedia title.

    Steps, in order: keep group 1 of ``apposition_regex`` if it matches,
    decode, unescape HTML entities, URL-unquote, turn underscores into
    spaces, strip.
    """
    found = apposition_regex.match(title)
    base_title = found.group(1) if found else title

    decoded = safe_decode(base_title)
    unescaped = html_parser.unescape(decoded)
    unquoted = urllib.unquote_plus(unescaped)

    with_spaces = unquoted.replace(u'_', u' ')
    return with_spaces.strip()
Beispiel #32
0
    def phrase(cls, language, key, value, is_plural=False, country=None):
        """Build a ``CategoryQuery`` for a category, optionally with a
        randomly chosen preposition phrase.

        Returns ``NULL_CATEGORY_QUERY`` when ``category_config`` has no
        phrase for ``(language, key, value)``. When no preposition type is
        chosen, or no phrases are configured for it, the query carries no
        preposition and both ``add_place_name`` and ``add_address`` are True.
        """
        category_phrase = category_config.get_phrase(language, key, value,
                                                     is_plural=is_plural)
        if not category_phrase:
            return NULL_CATEGORY_QUERY

        category_phrase = safe_decode(category_phrase)

        def without_preposition():
            return CategoryQuery(category_phrase, prep=None,
                                 add_place_name=True, add_address=True)

        prep_phrase_type = CategoryPreposition.random(language, country=country)
        if prep_phrase_type in (None, CategoryPreposition.NULL):
            return without_preposition()

        config_key = 'categories.{}'.format(prep_phrase_type)
        values, probs = address_config.alternative_probabilities(
            config_key, language, country=country)
        if not values:
            return without_preposition()

        chosen_phrase, _ = weighted_choice(values, probs)
        prep_phrase = safe_decode(chosen_phrase)

        # "nearby"/"near me" stand alone; "in" takes a place name but not
        # a full address.
        no_address_types = (CategoryPreposition.NEARBY,
                            CategoryPreposition.NEAR_ME,
                            CategoryPreposition.IN)
        no_place_types = (CategoryPreposition.NEARBY,
                          CategoryPreposition.NEAR_ME)

        return CategoryQuery(category_phrase,
                             prep=prep_phrase,
                             add_place_name=prep_phrase_type not in no_place_types,
                             add_address=prep_phrase_type not in no_address_types)
Beispiel #33
0
    def random(cls, language, country=None):
        """Generate a random block identifier.

        The identifier type comes from ``cls.choose_alphanumeric_type`` for
        ``'blocks.alphanumeric'``. Depending on that type the result is a
        number, a letter, or a letter/number combination with an optional
        space in between. Returns ``None`` when no type is chosen or the
        type is not one of the handled ones.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'blocks.alphanumeric', language, country=country)
        if num_type is None:
            return None

        if num_type == cls.NUMERIC:
            return safe_decode(
                weighted_choice(cls.block_range, cls.block_range_cdf))

        alphabet = address_config.get_property(
            'alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property(
            'alphabet_probability', language, country=country, default=None)
        # Fall back to the Latin alphabet when the random draw is at or
        # above the configured probability.
        if (alphabet_probability is not None
                and random.random() >= alphabet_probability):
            alphabet = latin_alphabet

        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)

        number = weighted_choice(cls.block_range, cls.block_range_cdf)

        whitespace_probability = float(
            num_type_props.get('whitespace_probability', 0.0))
        if whitespace_probability and random.random() < whitespace_probability:
            separator = six.u(' ')
        else:
            separator = six.u('')

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, separator, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, separator, letter)
        return None
Beispiel #34
0
    def add_affixes(self, lang, *confs):
        """Register affix phrases for ``lang`` and compile matching regexes.

        Phrases are read from the ``prefixes``, ``prefixes_no_whitespace``,
        ``suffixes``, ``suffixes_no_whitespace``, ``prefixes_similarity_only``
        and ``suffixes_similarity_only`` keys of each conf, and lowercased.

        Populates ``self.language_prefixes`` / ``self.language_suffixes``
        and, when there is something to match, the corresponding
        ``language_*_regexes`` and ``language_*_sim_only_regexes`` entries.
        Phrases are inserted into the patterns as-is (not regex-escaped).

        :param lang: language key under which everything is stored.
        :param confs: dict-like configs providing the keys listed above.
        """
        prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
        prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]

        self.language_prefixes[lang] = prefixes + prefixes_no_whitespace

        suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
        suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]

        self.language_suffixes[lang] = suffixes + suffixes_no_whitespace

        # Separator (space or hyphen) between an affix and the rest of the name
        whitespace_phrase = six.u('[ \-]')

        all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
        all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace

        if all_prefixes:
            prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
            self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)

        if all_suffixes:
            suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
            self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)

        sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]
        if sim_only_prefixes:
            sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
            self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)

        # Mirror the prefix case: separator followed by the phrase. The
        # previous template had a single placeholder, so the phrase itself
        # was silently dropped and only the separator ended up in the regex.
        sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]
        if sim_only_suffixes:
            sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))

            self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
def get_script_codes(all_scripts):
    """Map script codes to the script names present in ``all_scripts``.

    If LOCAL_ISO_15924_FILE does not exist yet, the ISO 15924 zip is
    downloaded from ISO_15924_URL and its ``iso15924*`` member is written
    there with blank and ``#`` comment lines removed. Codes are then read
    from that file, extended with the ``sc`` property value aliases and
    finally with SCRIPT_ALIASES_SUPPLEMENTAL.

    :param all_scripts: container of known script names.
    :return: dict of ``{code: script_name}``.
    """
    if not os.path.exists(LOCAL_ISO_15924_FILE):
        # This comes as a .zip
        script_codes_response = requests.get(ISO_15924_URL)
        with ZipFile(StringIO(script_codes_response.content)) as zf:
            iso15924_filename = [
                name for name in zf.namelist() if name.startswith('iso15924')
            ][0]
            raw_contents = zf.read(iso15924_filename)

        # Strip out the comments, etc.
        temp_iso15924_file = u'\n'.join([
            line.rstrip()
            for line in safe_decode(raw_contents).split('\n')
            if line.strip() and not line.strip().startswith('#')
        ])

        with open(LOCAL_ISO_15924_FILE, 'w') as f:
            f.write(safe_encode(temp_iso15924_file))

    script_codes = {}
    seen_scripts = set()

    # Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
    with open(LOCAL_ISO_15924_FILE) as script_codes_file:
        for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
            if name in all_scripts:
                script_codes[code] = name
                seen_scripts.add(name)
            else:
                # Retry with any parenthesised qualifier removed, unless
                # that base name was already claimed by another code.
                normalized_name = name.split('(')[0].strip()
                if normalized_name in all_scripts and normalized_name not in seen_scripts:
                    script_codes[code] = normalized_name
                    seen_scripts.add(normalized_name)

    value_aliases = get_property_value_aliases()
    script_aliases = value_aliases['sc']

    # .items() works on both Python 2 and 3 (.iteritems() is 2-only)
    for code, script in script_aliases.items():
        if code not in script_codes and script in all_scripts:
            script_codes[code] = script

    script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)

    return script_codes
Beispiel #36
0
    def name(self, country, language, component, name):
        """Randomly rewrite ``name`` using configured replacements and affixes.

        Country-specific and global (``None`` key) regex replacements are
        applied first, each with its own probability. Then at most one
        affix is added: the prefix is tried before the suffix, and an affix
        kind is skipped when its regex already matches the name.

        :return: the possibly modified name; the input is returned
            untouched when nothing is configured for it.
        """
        replacements = (self.country_regex_replacements.get(country, [])
                        + self.country_regex_replacements.get(None, []))

        affix_key = (language, component)
        prefixes, prefix_probs = self.prefixes.get(affix_key, (None, None))
        suffixes, suffix_probs = self.suffixes.get(affix_key, (None, None))

        if not (replacements or prefixes or suffixes):
            return name

        for pattern, group, prob in replacements:
            found = pattern.match(name)
            if found and random.random() < prob:
                name = found.group(group)

        affix_specs = (
            (prefixes, prefix_probs, self.prefix_regexes, 'prefix', True),
            (suffixes, suffix_probs, self.suffix_regexes, 'suffix', False),
        )
        for candidates, candidate_probs, regexes, key, is_prefix in affix_specs:
            if candidates is None:
                continue

            # Leave the name alone if the regex for this affix kind
            # already matches it.
            if regexes[language, component].match(name):
                continue

            chosen = weighted_choice(candidates, candidate_probs)
            if chosen is None:
                continue

            if chosen.get('whitespace', True):
                separator = six.u(' ')
            else:
                separator = six.u('')
            affix_text = chosen[key]

            if is_prefix:
                return six.u('{}{}{}').format(affix_text, separator,
                                              safe_decode(name))
            return six.u('{}{}{}').format(safe_decode(name), separator,
                                          affix_text)

        return name
Beispiel #37
0
    def phrase(cls, chain, language, country=None):
        """Build a ``ChainQuery`` for a chain name, optionally with a
        randomly chosen preposition phrase.

        Returns ``NULL_CHAIN_QUERY`` for an empty chain. When no
        preposition type is chosen, or no phrases are configured for it,
        the query has no preposition and keeps both place name and address.
        """
        if not chain:
            return NULL_CHAIN_QUERY

        chain_phrase = safe_decode(chain)
        prep_phrase_type = CategoryPreposition.random(language, country=country)

        # Only consult the config when a real preposition type was chosen
        values = None
        if prep_phrase_type not in (None, CategoryPreposition.NULL):
            values, probs = address_config.alternative_probabilities(
                'categories.{}'.format(prep_phrase_type), language, country=country)

        if not values:
            return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)

        chosen_phrase, _ = weighted_choice(values, probs)
        prep_phrase = safe_decode(chosen_phrase)

        standalone_types = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        add_address = prep_phrase_type not in standalone_types + (CategoryPreposition.IN,)
        add_place_name = prep_phrase_type not in standalone_types

        return ChainQuery(chain_phrase, prep=prep_phrase,
                          add_place_name=add_place_name, add_address=add_address)
Beispiel #38
0
    def __init__(self, base_dir=STATE_DIR):
        """Load per-country state abbreviation configs from ``base_dir``.

        Every file in ``base_dir`` is parsed as YAML; the country key is
        the file name up to ``.yaml``. Each config is read as
        ``{abbreviation: {language: full_name}}``.

        After loading:
          * ``self.abbreviations[country]`` maps
            ``(full_name.lower(), language)`` to a list of abbreviations.
          * ``self.full_names[country]`` maps ``abbreviation.lower()`` to
            ``{language: full_name}``.
        """
        self.full_names = {}
        self.abbreviations = {}

        for filename in os.listdir(base_dir):
            country = filename.split('.yaml')[0]
            # Close each config file once parsed instead of leaking the
            # handle.
            # NOTE(review): yaml.load without an explicit Loader is unsafe
            # on untrusted input and rejected by newer PyYAML releases;
            # consider yaml.safe_load if these files are plain data.
            with open(os.path.join(base_dir, filename)) as config_file:
                country_config = yaml.load(config_file)

            country_abbreviations = defaultdict(list)
            country_full_names = defaultdict(dict)

            for abbreviation, vals in six.iteritems(country_config):
                for language, full_name in six.iteritems(vals):
                    full_name = safe_decode(full_name)
                    abbreviation = safe_decode(abbreviation)
                    country_abbreviations[(full_name.lower(),
                                           language)].append(abbreviation)
                    country_full_names[
                        abbreviation.lower()][language] = full_name

            self.abbreviations[country] = dict(country_abbreviations)
            self.full_names[country] = dict(country_full_names)
Beispiel #39
0
    def scan(self, s):
        """Yield ``(token, response)`` pairs for every regex match in ``s``.

        The index of the last matched group selects the entry in
        ``self.responses``. A callable response is invoked with
        ``(match, token)`` and may return an iterable of
        ``(response, token)`` pairs, each yielded as ``(token, response)``;
        a ``None`` return yields nothing.
        """
        text = safe_decode(s)
        for found in self.regex.finditer(text):
            group_index = found.lastindex
            handler = self.responses[group_index - 1]
            matched_text = found.group(group_index)

            if callable(handler):
                produced = handler(found, matched_text)
                if produced is None:
                    continue
                for label, piece in produced:
                    yield (piece, label)
            else:
                yield (matched_text, handler)
Beispiel #40
0
    def random_from_int(cls, number, language, country=None):
        """Render a level number in a randomly chosen style.

        The style comes from ``cls.choose_alphanumeric_type`` for
        ``'levels.alphanumeric'``. Non-negative numbers are first shifted
        by the configured ``levels.numbering_starts_at``. Returns ``None``
        when no style is chosen or the style is not handled.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'levels.alphanumeric', language, country=country)
        if num_type is None:
            return None

        numbering_starts_at = int(address_config.get_property(
            'levels.numbering_starts_at', language, country=country, default=0))
        if number >= 0:
            number += numbering_starts_at

        if num_type == cls.NUMERIC:
            return safe_decode(number)

        if num_type == cls.ROMAN_NUMERAL:
            roman_numeral = numeric_expressions.roman_numeral(number)
            # Fall back to the plain number when no numeral is available
            if roman_numeral is None:
                return safe_decode(number)
            return roman_numeral

        if num_type == cls.HYPHENATED_NUMBER:
            upper = number + sample_floors_range(1, cls.max_floors)
            return u'{}-{}'.format(number, upper)

        alphabet = address_config.get_property(
            'alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property(
            'alphabet_probability', language, country=country, default=None)
        if (alphabet_probability is not None
                and random.random() >= alphabet_probability):
            alphabet = latin_alphabet

        letter = sample_alphabet(alphabet)
        if num_type == cls.ALPHA:
            return letter

        # For letter/number combinations the number is drawn afresh
        # rather than taken from the argument.
        number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}').format(letter, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}').format(number, letter)

        return None
Beispiel #41
0
    def phrase(cls, chain, language, country=None):
        """Create a ``ChainQuery`` for ``chain`` with an optional preposition.

        An empty chain gives ``NULL_CHAIN_QUERY``. If no preposition type
        is chosen, or the config has no phrases for it, the query has
        ``prep=None`` and both ``add_place_name`` and ``add_address`` set.
        """
        if not chain:
            return NULL_CHAIN_QUERY

        chain_phrase = safe_decode(chain)

        def bare_query():
            return ChainQuery(chain_phrase, prep=None,
                              add_place_name=True, add_address=True)

        prep_phrase_type = CategoryPreposition.random(language, country=country)
        if prep_phrase_type in (None, CategoryPreposition.NULL):
            return bare_query()

        config_key = 'categories.{}'.format(prep_phrase_type)
        values, probs = address_config.alternative_probabilities(
            config_key, language, country=country)
        if not values:
            return bare_query()

        selected, _ = weighted_choice(values, probs)
        prep_phrase = safe_decode(selected)

        types_without_address = (CategoryPreposition.NEARBY,
                                 CategoryPreposition.NEAR_ME,
                                 CategoryPreposition.IN)
        types_without_place = (CategoryPreposition.NEARBY,
                               CategoryPreposition.NEAR_ME)

        return ChainQuery(
            chain_phrase,
            prep=prep_phrase,
            add_place_name=prep_phrase_type not in types_without_place,
            add_address=prep_phrase_type not in types_without_address)
def read_dictionary_file(path):
    """Yield the ``|``-separated phrases of each non-blank line in ``path``.

    :param path: path of the dictionary file.
    :raises InvalidAddressFileException: if a line contains ``}`` or any
        of its phrases is blank.
    """
    # The with-block closes the file when the generator is exhausted,
    # closed, or aborted by an exception.
    with open(path) as f:
        for i, line in enumerate(f):
            line = safe_decode(line.rstrip())
            if not line.strip():
                continue

            if u'}' in line:
                raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
            phrases = line.split(u'|')

            if any(not p.strip() for p in phrases):
                raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))

            yield phrases
Beispiel #43
0
    def configure(self, base_dir=DICTIONARIES_DIR):
        """Load the phrase dictionaries under ``base_dir`` and build the trie.

        For every language directory in ``base_dir`` and every file name in
        ``self.dictionaries``, each non-blank line is split on ``|``; the
        first phrase is the canonical form. Side effects:

          * ``self.canonicals[(canonical, lang, dictionary_name)]`` holds
            the remaining phrases of the line.
          * ``self.trie`` becomes a ``BytesTrie`` whose values are
            ``'lang|dictionary|is_canonical|canonical'`` strings.
          * ``self.configured`` is set to True.

        Phrases in POSSIBLE_ROMAN_NUMERALS are not indexed. Phrases from
        suffix dictionaries are stored reversed behind SUFFIX_KEY, those
        from prefix dictionaries behind PREFIX_KEY.

        :param base_dir: root directory containing one sub-directory per
            language.
        """
        kvs = defaultdict(OrderedDict)
        # Honour the base_dir argument (it used to be ignored in favour of
        # the module-level default).
        for lang in os.listdir(base_dir):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename

                dictionary_name = filename.split('.', 1)[0]

                path = os.path.join(base_dir, lang, filename)
                if not os.path.exists(path):
                    continue

                # Close each dictionary file once it has been read
                with open(path) as dictionary_file:
                    for line in dictionary_file:
                        line = line.strip()
                        if not line:
                            continue

                        phrases = safe_decode(line).split(u'|')

                        canonical = phrases[0]
                        canonical_normalized = normalize_string(canonical)

                        self.canonicals[(canonical, lang,
                                         dictionary_name)] = phrases[1:]

                        for phrase in phrases:

                            if phrase in POSSIBLE_ROMAN_NUMERALS:
                                continue

                            is_canonical = normalize_string(
                                phrase) == canonical_normalized

                            if is_suffix_dictionary:
                                # Stored reversed behind the suffix key
                                phrase = SUFFIX_KEY + phrase[::-1]
                            elif is_prefix_dictionary:
                                phrase = PREFIX_KEY + phrase

                            kvs[phrase][(lang, dictionary_name,
                                         canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)),
                             safe_encode(c)])) for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True
Beispiel #44
0
    def phrase(cls, language, key, value, is_plural=False, country=None):
        """Create a ``CategoryQuery`` for a category with an optional
        preposition.

        ``NULL_CATEGORY_QUERY`` is returned when ``category_config`` has no
        phrase. If no preposition type is chosen, or no phrases are
        configured for it, the query has ``prep=None`` and both
        ``add_place_name`` and ``add_address`` set.
        """
        raw_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
        if not raw_phrase:
            return NULL_CATEGORY_QUERY

        category_phrase = safe_decode(raw_phrase)
        prep_phrase_type = CategoryPreposition.random(language, country=country)

        # Only consult the config when a real preposition type was chosen
        values = None
        if prep_phrase_type not in (None, CategoryPreposition.NULL):
            values, probs = address_config.alternative_probabilities(
                'categories.{}'.format(prep_phrase_type), language, country=country)

        if not values:
            return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)

        selected, _ = weighted_choice(values, probs)
        prep_phrase = safe_decode(selected)

        standalone_types = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        add_address = prep_phrase_type not in standalone_types + (CategoryPreposition.IN,)
        add_place_name = prep_phrase_type not in standalone_types

        return CategoryQuery(category_phrase, prep=prep_phrase,
                             add_place_name=add_place_name, add_address=add_address)
Beispiel #45
0
    def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
        """Load every language's address-expansion dictionaries.

        Each sub-directory of ``base_dir`` is a language; each file in it
        must be a ``.txt`` file whose base name is in ``gazetteer_types``.

        After loading:
          * ``self.languages`` lists the language directory names.
          * ``self.language_dictionaries[language]`` lists dictionary names.
          * ``self.phrases[(language, dictionary_name)]`` lists, per
            non-blank line, the ``|``-separated phrases.

        :raises InvalidAddressFileException: on a bad extension, an
            unknown dictionary name, a line containing ``}``, or a blank
            synonym.
        """
        self.base_dir = base_dir
        self.languages = []

        self.language_dictionaries = defaultdict(list)
        self.phrases = defaultdict(list)

        for language in os.listdir(base_dir):
            language_dir = os.path.join(base_dir, language)
            if not os.path.isdir(language_dir):
                continue

            self.languages.append(language)

            for filename in os.listdir(language_dir):
                if not filename.endswith('.txt'):
                    raise InvalidAddressFileException(
                        u'Invalid extension for file {}/{}, must be .txt'.
                        format(language_dir, filename))
                dictionary_name = filename.split('.')[0].lower()

                if dictionary_name not in gazetteer_types:
                    raise InvalidAddressFileException(
                        u'Invalid filename for file {}/{}. Must be one of {{{}}}'
                        .format(language_dir, filename,
                                ', '.join(sorted(gazetteer_types))))
                self.language_dictionaries[language].append(dictionary_name)

                path = os.path.join(language_dir, filename)
                # Close each dictionary file when done, including when a
                # validation error is raised part-way through.
                with open(path) as dictionary_file:
                    for i, line in enumerate(dictionary_file):
                        line = safe_decode(line.rstrip())
                        if not line.strip():
                            continue

                        if u'}' in line:
                            raise InvalidAddressFileException(
                                u'Found }} in file: {}, line {}'.format(
                                    path, i + 1))
                        phrases = line.split(u'|')

                        if any(not p.strip() for p in phrases):
                            raise InvalidAddressFileException(
                                u'Found blank synonym in: {}, line {}'.format(
                                    path, i + 1))

                        self.phrases[(language, dictionary_name)].append(phrases)

        # Freeze into plain dicts so later lookups of missing keys raise
        # KeyError instead of silently creating entries.
        self.language_dictionaries = dict(self.language_dictionaries)
        self.phrases = dict(self.phrases)
def scrape_nominatim_category_page(url, ignore_plurals=False):
    """Yield ``(phrase, key, value, is_plural)`` rows scraped from ``url``.

    Rows are found with ``phrase_table_re``. Rows with an operator other
    than ``'-'`` are skipped, as are plural rows when ``ignore_plurals``
    is set. The phrase is decoded and lowercased.
    """
    response = requests.get(url)
    if not (response and response.content):
        return

    rows = phrase_table_re.findall(response.content)
    for phrase, key, value, operator, plural in rows:
        if operator and operator != '-':
            continue

        is_plural = plural == 'Y'
        if ignore_plurals and is_plural:
            continue

        yield safe_decode(phrase).lower(), key, value, is_plural
Beispiel #47
0
    def random(cls, language, country=None):
        """Generate a random staircase identifier.

        The identifier type comes from ``cls.choose_alphanumeric_type`` for
        ``'staircases.alphanumeric'``. The result is a number, a hyphenated
        pair of numbers, a letter, or a letter/number combination joined by
        nothing, a space or a hyphen. Returns ``None`` when no type is
        chosen or the type is not handled.
        """
        num_type, num_type_props = cls.choose_alphanumeric_type(
            'staircases.alphanumeric', language, country=country)
        if num_type is None:
            return None

        def draw_number():
            return weighted_choice(cls.staircase_range, cls.staircase_range_cdf)

        if num_type == cls.NUMERIC:
            return safe_decode(draw_number())

        if num_type == cls.HYPHENATED_NUMBER:
            first = draw_number()
            second = first + draw_number()
            return u'{}-{}'.format(first, second)

        alphabet = address_config.get_property(
            'alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property(
            'alphabet_probability', language, country=country, default=None)
        if (alphabet_probability is not None
                and random.random() >= alphabet_probability):
            alphabet = latin_alphabet

        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)

        number = draw_number()

        whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
        hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))

        # One draw decides between space, hyphen and no separator
        draw = random.random()
        if draw < whitespace_probability:
            separator = u' '
        elif draw < (whitespace_probability + hyphen_probability):
            separator = u'-'
        else:
            separator = u''

        if num_type == cls.ALPHA_PLUS_NUMERIC:
            return six.u('{}{}{}').format(letter, separator, number)
        if num_type == cls.NUMERIC_PLUS_ALPHA:
            return six.u('{}{}{}').format(number, separator, letter)
        return None
Beispiel #48
0
def read_dictionary_file(path):
    """Yield the ``|``-separated phrases of each non-blank line in ``path``.

    :param path: path of the dictionary file.
    :raises InvalidAddressFileException: if a line contains ``}`` or any
        of its phrases is blank.
    """
    # The with-block closes the file when the generator is exhausted,
    # closed, or aborted by an exception.
    with open(path) as f:
        for i, line in enumerate(f):
            line = safe_decode(line.rstrip())
            if not line.strip():
                continue

            if u'}' in line:
                raise InvalidAddressFileException(
                    u'Found }} in file: {}, line {}'.format(path, i + 1))
            phrases = line.split(u'|')

            if any(not p.strip() for p in phrases):
                raise InvalidAddressFileException(
                    u'Found blank synonym in: {}, line {}'.format(path, i + 1))

            yield phrases
def openaddresses_download_configured_files(out_dir):
    """Download and unzip every configured OpenAddresses source.

    For each path in ``openaddresses_config.sources`` the zip URL is built
    relative to OPENADDRESSES_LATEST_DIR and handed to
    ``download_and_unzip_file`` together with ``out_dir``. Failures are
    printed and do not stop the loop.
    """
    for path in openaddresses_config.sources:

        source = six.b('/').join([safe_encode(p) for p in path])
        filename = safe_encode(path[-1]) + six.b('.zip')
        # (An unused `zip_path = filename + '.zip'` was removed here: it
        # concatenated bytes with str, which raises TypeError on Python 3.)
        zip_url_path = six.b('/').join([safe_encode(p) for p in path[:-1]] + [quote_plus(filename)])

        url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path)

        # NOTE(review): this runs once per source, so the pre-release
        # archives are re-downloaded on every iteration; confirm whether it
        # was meant to run once before the loop.
        download_pre_release_downloads(out_dir)

        print(six.u('doing {}').format(safe_decode(source)))
        success = download_and_unzip_file(url, out_dir)
        if not success:
            # Decode so the message matches the progress line above
            # instead of printing a raw bytes value.
            print(six.u('ERR: could not download {}').format(safe_decode(source)))
def scrape_nominatim_category_page(url, ignore_plurals=False):
    """Yield phrase rows extracted from the page at ``url``.

    The page is fetched with ``requests.get`` and scanned with
    ``phrase_table_re``; each match supplies
    ``(phrase, key, value, operator, plural)``.

    Nothing is yielded when the response is falsy or has no content.
    Rows whose operator is non-empty and not ``'-'`` are skipped, as are
    plural rows (``plural == 'Y'``) when ``ignore_plurals`` is true.

    Yields:
        ``(phrase, key, value, is_plural)`` where ``phrase`` has been
        passed through ``safe_decode`` and lower-cased.
    """
    response = requests.get(url)
    if not (response and response.content):
        return

    rows = phrase_table_re.findall(response.content)
    for phrase, key, value, operator, plural in rows:
        has_real_operator = operator and operator != '-'
        is_plural = plural == 'Y'

        if has_real_operator or (ignore_plurals and is_plural):
            continue

        yield safe_decode(phrase).lower(), key, value, is_plural
Beispiel #51
0
def get_script_codes(all_scripts):
    """Build a mapping from script code to script name.

    If ``LOCAL_ISO_15924_FILE`` does not exist it is created first: the zip
    at ``ISO_15924_URL`` is downloaded, the first member whose name starts
    with ``iso15924`` is read, blank lines and ``#`` comment lines are
    dropped, and the remainder is written to ``LOCAL_ISO_15924_FILE``.

    The local file is then parsed as ``;``-delimited rows. A row's code is
    mapped to its name when the name is in ``all_scripts``; otherwise the
    name with any ``(...)`` suffix removed is tried, and used only if no
    earlier row already claimed that script. Codes from the ``'sc'`` entry
    of ``get_property_value_aliases()`` are added when not already present
    and their script is in ``all_scripts``. Finally
    ``SCRIPT_ALIASES_SUPPLEMENTAL`` is merged in, overriding earlier
    entries.

    Both file handles are managed with ``with`` so they are closed even if
    writing or parsing raises.

    Args:
        all_scripts: container of known script names, used for membership
            tests only.

    Returns:
        dict mapping code to script name.
    """
    if not os.path.exists(LOCAL_ISO_15924_FILE):
        # This comes as a .zip
        script_codes_response = requests.get(ISO_15924_URL)
        zf = ZipFile(StringIO(script_codes_response.content))
        iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]

        # Strip out the comments, etc.
        temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n')
                                        if line.strip() and not line.strip().startswith('#')])

        with open(LOCAL_ISO_15924_FILE, 'w') as f:
            f.write(safe_encode(temp_iso15924_file))

    script_codes = {}
    seen_scripts = set()

    # Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
    with open(LOCAL_ISO_15924_FILE) as script_codes_file:
        for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
            if name in all_scripts:
                script_codes[code] = name
                seen_scripts.add(name)
            else:
                # Fall back to the name without its parenthesised qualifier,
                # but only for the first row that normalizes to that script.
                normalized_name = name.split('(')[0].strip()
                if normalized_name in all_scripts and normalized_name not in seen_scripts:
                    script_codes[code] = normalized_name
                    seen_scripts.add(normalized_name)

    value_aliases = get_property_value_aliases()
    script_aliases = value_aliases['sc']

    for code, script in script_aliases.iteritems():
        if code not in script_codes and script in all_scripts:
            script_codes[code] = script

    script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)

    return script_codes
Beispiel #52
0
    def configure(self, base_dir=DICTIONARIES_DIR):
        """Load the dictionary files under ``base_dir`` and build the trie.

        For every entry ``lang`` in ``base_dir`` and every filename in
        ``self.dictionaries``, the file ``base_dir/lang/filename`` is read
        if it exists. Each non-blank line is a ``|``-separated list of
        phrases whose first element is the canonical form.

        Side effects:
            * ``self.canonicals[(canonical, lang, dictionary_name)]`` is set
              to the non-canonical phrases of each line.
            * ``self.trie`` is set to a ``BytesTrie`` keyed by phrase, with
              values of the form ``lang|dictionary|is_canonical|canonical``.
            * ``self.configured`` is set to ``True``.

        Phrases found in ``POSSIBLE_ROMAN_NUMERALS`` are not indexed.
        Phrases from a file whose name contains ``suffixes`` are stored
        reversed behind ``SUFFIX_KEY``; those from a file whose name
        contains ``prefixes`` are stored behind ``PREFIX_KEY``.

        Args:
            base_dir: directory containing one sub-directory per language.
                The argument is now honoured; previously the module-level
                default was always used regardless of what was passed.
        """
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(base_dir):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename

                dictionary_name = filename.split('.', 1)[0]

                path = os.path.join(base_dir, lang, filename)
                if not os.path.exists(path):
                    continue

                # Context manager so each dictionary file is closed promptly.
                with open(path) as dictionary_file:
                    for line in dictionary_file:
                        line = line.strip()
                        if not line:
                            continue

                        phrases = safe_decode(line).split(u'|')

                        canonical = phrases[0]
                        canonical_normalized = normalize_string(canonical)

                        self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]

                        for phrase in phrases:

                            if phrase in POSSIBLE_ROMAN_NUMERALS:
                                continue

                            is_canonical = normalize_string(phrase) == canonical_normalized

                            if is_suffix_dictionary:
                                # Suffixes are stored reversed behind a marker key.
                                phrase = SUFFIX_KEY + phrase[::-1]
                            elif is_prefix_dictionary:
                                phrase = PREFIX_KEY + phrase

                            kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True
Beispiel #53
0
    def replace_suffixes(self, name, lang, country=None, sim_only=False):
        """Remove configured suffix matches from ``name``.

        ``name`` is passed through ``safe_decode`` and stripped first. The
        regex table is ``self.language_suffix_sim_only_regexes`` when
        ``sim_only`` is true and ``lang`` has an entry there; otherwise it
        is ``self.language_suffix_regexes``.

        If ``country`` is given and the table has a ``(country, lang)``
        entry, its matches are replaced with the empty string. The same is
        then done with the ``lang`` entry, if there is one.

        Returns:
            The resulting name.
        """
        name = safe_decode(name).strip()
        empty = six.u('')

        use_sim_only = sim_only and lang in self.language_suffix_sim_only_regexes
        if use_sim_only:
            regexes = self.language_suffix_sim_only_regexes
        else:
            regexes = self.language_suffix_regexes

        if country is not None:
            country_regex = regexes.get((country, lang))
            if country_regex:
                name = country_regex.sub(empty, name)

        language_regex = regexes.get(lang)
        if language_regex:
            return language_regex.sub(empty, name)
        return name
Beispiel #54
0
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True, whitespace=False):
    '''
    Decode the input, run it through _normalize.normalized_tokens with
    the given string/token options and whitespace flag, and return a
    list of (token, token type) pairs.

    When strip_parentheticals is true the raw tokens are filtered through
    remove_parens before the token type ids are resolved with
    token_types.from_id.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    raw_tokens = _normalize.normalized_tokens(safe_decode(s), string_options,
                                              token_options, whitespace)

    if strip_parentheticals:
        raw_tokens = remove_parens(raw_tokens)

    results = []
    for token, type_id in raw_tokens:
        results.append((token, token_types.from_id(type_id)))
    return results
Beispiel #55
0
 def cleanup_number(cls, num, strip_commas=False):
     num = num.strip()
     if strip_commas:
         num = num.replace(six.u(','), six.u(''))
     try:
         num_int = int(num)
     except (ValueError, TypeError):
         try:
             num_float = float(num)
             leading_zeros = 0
             for c in num:
                 if c == six.u('0'):
                     leading_zeros += 1
                 else:
                     break
             num = safe_decode(int(num_float))
             if leading_zeros:
                 num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
         except (ValueError, TypeError):
             pass
     return num
    def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
        """Load every language's phrase dictionaries from ``base_dir``.

        Each sub-directory of ``base_dir`` is treated as a language; plain
        files at that level are ignored. Every file inside a language
        directory must end in ``.txt`` and its lower-cased base name must be
        one of ``gazetteer_types``. Each non-blank line is a
        ``|``-separated list of phrases.

        Attributes set:
            base_dir: the directory that was scanned.
            languages: list of language directory names, in listing order.
            language_dictionaries: dict mapping language to the list of
                dictionary names found for it.
            phrases: dict mapping ``(language, dictionary_name)`` to the
                list of phrase lists read from that file.

        Raises:
            InvalidAddressFileException: for a file with a non-``.txt``
                extension or an unknown dictionary name, a line containing
                ``}``, or a line with an empty or whitespace-only phrase.
        """
        self.base_dir = base_dir
        self.languages = []

        self.language_dictionaries = defaultdict(list)
        self.phrases = defaultdict(list)

        for language in os.listdir(base_dir):
            language_dir = os.path.join(base_dir, language)
            if not os.path.isdir(language_dir):
                continue

            self.languages.append(language)

            for filename in os.listdir(language_dir):
                if not filename.endswith('.txt'):
                    raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
                dictionary_name = filename.split('.')[0].lower()

                if dictionary_name not in gazetteer_types:
                    raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
                self.language_dictionaries[language].append(dictionary_name)

                path = os.path.join(language_dir, filename)
                # Context manager so the handle is closed even when a
                # validation error is raised part-way through the file.
                with open(path) as dictionary_file:
                    for i, line in enumerate(dictionary_file):
                        line = safe_decode(line.rstrip())
                        if not line.strip():
                            continue

                        if u'}' in line:
                            raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
                        phrases = line.split(u'|')

                        if any(not p.strip() for p in phrases):
                            raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))

                        self.phrases[(language, dictionary_name)].append(phrases)

        # Freeze into plain dicts so later lookups of missing keys raise
        # KeyError instead of silently creating empty lists.
        self.language_dictionaries = dict(self.language_dictionaries)
        self.phrases = dict(self.phrases)