def name(self, country, language, component, name):
    """Optionally transform a toponym *name* via country-specific regex
    replacements and randomly sampled language/component affixes.

    Order matters: replacements are applied first, then a prefix is tried
    before a suffix; the first affix applied short-circuits with a return.
    Returns the (possibly unchanged) name.
    """
    # Country-specific replacements plus global (None-keyed) replacements
    all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, [])
    prefixes, prefix_probs = self.prefixes.get((language, component), (None, None))
    suffixes, suffix_probs = self.suffixes.get((language, component), (None, None))
    if not all_replacements and not prefixes and not suffixes:
        return name
    # Each replacement fires independently with its own probability
    for regex, group, prob in all_replacements:
        match = regex.match(name)
        if match and random.random() < prob:
            name = match.group(group)
    # direction 0 = prepend (prefix), 1 = append (suffix)
    for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0), (suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)):
        if affixes is not None:
            regex = regexes[language, component]
            # Skip if the name already carries an affix of this kind
            if regex.match(name):
                continue
            affix = weighted_choice(affixes, affix_probs)
            if affix is not None:
                # affix is a dict here; 'whitespace' controls the separator,
                # affix[key] is the actual affix string
                whitespace = affix.get('whitespace', True)
                space_val = six.u(' ') if whitespace else six.u('')
                affix = affix[key]
                if direction == 0:
                    return six.u('{}{}{}').format(affix, space_val, safe_decode(name))
                else:
                    return six.u('{}{}{}').format(safe_decode(name), space_val, affix)
    return name
def join(cls, phrases, language, country=None):
    """Join *phrases* using a separator phrase sampled from the address config.

    At most ``max_phrase_join`` trailing phrases are joined with the sampled
    separator; any earlier phrases are joined with the configured default
    join string and prepended.
    """
    if not hasattr(phrases, '__iter__'):
        raise ValueError('Param phrases must be iterable')
    alternatives, alt_probs = address_config.alternative_probabilities(cls.key, language, country=country)
    separator, sep_props = weighted_choice(alternatives, alt_probs)
    use_whitespace = sep_props.get('whitespace', True)
    pad = six.u(' ') if use_whitespace else six.u('')
    decoded = [safe_decode(p) for p in phrases]
    join_limit = sep_props.get('max_phrase_join', 2)
    prefix = six.u('')
    if len(decoded) > join_limit:
        fallback = cls.DEFAULT_WHITESPACE_JOIN if use_whitespace else cls.DEFAULT_NON_WHITESPACE_JOIN
        default_join = safe_decode(sep_props.get('default_join', fallback))
        # Trailing [''] yields a trailing default_join before the separator-joined tail
        prefix = default_join.join(decoded[:-join_limit] + [six.u('')])
    if use_whitespace:
        separator = six.u('{}{}{}').format(pad, separator, pad)
    tail = separator.join(decoded[-join_limit:])
    return six.u('').join([prefix, tail])
def phrase(cls, unit, language, country=None, zone=None):
    """Build a unit phrase (e.g. an apartment designator plus number) for *unit*.

    When unit is None, samples a standalone unit-type phrase instead,
    returning it title-cased, or None when nothing is configured.
    """
    if unit is not None:
        # Zoned units use a zone-specific config key
        key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)
        # Some language/country configs have no unit phrases at all
        if not address_config.get_property(key, language, country=country):
            return None
        is_alpha = safe_decode(unit).isalpha()
        direction_unit = None
        add_direction = address_config.get_property('{}.add_direction'.format(key), language, country=country)
        if add_direction:
            direction_unit = cls.add_direction(key, unit, language, country=country)
        if direction_unit and direction_unit != unit:
            # Direction was appended, value is no longer purely alphabetic
            unit = direction_unit
            is_alpha = False
        else:
            add_quadrant = address_config.get_property('{}.add_quadrant'.format(key), language, country=country)
            if add_quadrant:
                unit = cls.add_quadrant(key, unit, language, country=country)
                is_alpha = False
        return cls.numeric_phrase(key, safe_decode(unit), language, dictionaries=['unit_types_numbered'], country=country, is_alpha=is_alpha)
    else:
        key = 'units.standalone'
        values, probs = address_config.alternative_probabilities(key, language, dictionaries=['unit_types_standalone'], country=country)
        if values is None:
            return None
        phrase, phrase_props = weighted_choice(values, probs)
        return phrase.title()
def random(cls, language, country=None):
    """Generate a random block identifier: numeric, alphabetic, or a
    combination of the two, per the sampled alphanumeric type."""
    num_type, props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
    if num_type is None:
        return None
    if num_type == cls.NUMERIC:
        return safe_decode(weighted_choice(cls.block_range, cls.block_range_cdf))
    # Alphabetic component: possibly fall back to the Latin alphabet
    alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
    alphabet_prob = address_config.get_property('alphabet_probability', language, country=country, default=None)
    if alphabet_prob is not None and random.random() >= alphabet_prob:
        alphabet = latin_alphabet
    letter = sample_alphabet(alphabet, 2.0)
    if num_type == cls.ALPHA:
        return safe_decode(letter)
    number = weighted_choice(cls.block_range, cls.block_range_cdf)
    space_prob = float(props.get('whitespace_probability', 0.0))
    sep = six.u('')
    if space_prob and random.random() < space_prob:
        sep = six.u(' ')
    if num_type == cls.ALPHA_PLUS_NUMERIC:
        return six.u('{}{}{}').format(letter, sep, number)
    elif num_type == cls.NUMERIC_PLUS_ALPHA:
        return six.u('{}{}{}').format(number, sep, letter)
def join(cls, phrases, language, country=None):
    """Join an iterable of phrase strings with a sampled joining phrase.

    Only the last ``max_phrase_join`` phrases get the sampled joiner; any
    remainder is glued on the front with the default join string.
    """
    if not hasattr(phrases, '__iter__'):
        raise ValueError('Param phrases must be iterable')
    choices, choice_probs = address_config.alternative_probabilities(cls.key, language, country=country)
    joiner, joiner_props = weighted_choice(choices, choice_probs)
    has_whitespace = joiner_props.get('whitespace', True)
    parts = [safe_decode(p) for p in phrases]
    limit = joiner_props.get('max_phrase_join', 2)
    if len(parts) > limit:
        default_join = safe_decode(
            joiner_props.get('default_join',
                             cls.DEFAULT_WHITESPACE_JOIN if has_whitespace else cls.DEFAULT_NON_WHITESPACE_JOIN))
        head = default_join.join(parts[:-limit] + [six.u('')])
    else:
        head = six.u('')
    if has_whitespace:
        gap = six.u(' ')
        joiner = six.u('{}{}{}').format(gap, joiner, gap)
    return six.u('').join([head, joiner.join(parts[-limit:])])
def latlon_to_decimal(latitude, longitude):
    """Parse latitude/longitude strings in several formats (decimal with comma
    or period, DMS, decimal with compass direction) into validated floats.

    Raises ValueError when the parsed values are out of range.
    """
    # NOTE(review): have_lat/have_lon are set but never read afterwards
    have_lat = False
    have_lon = False
    # Strip stray separators and normalize decimal comma to period
    latitude = safe_decode(latitude).strip(u' ,;|')
    longitude = safe_decode(longitude).strip(u' ,;|')
    latitude = latitude.replace(u',', u'.')
    longitude = longitude.replace(u',', u'.')
    lat_dms = latitude_dms_regex.match(latitude)
    lat_dir = latitude_decimal_with_direction_regex.match(latitude)
    if lat_dms:
        d, m, s, c = lat_dms.groups()
        # NOTE(review): sign is computed here but never applied to the DMS
        # result — a southern/western DMS coordinate may lose its sign.
        # Confirm whether degrees_to_decimal already handles the direction.
        sign = direction_sign(c)
        latitude = degrees_to_decimal(d or 0, m or 0, s or 0)
        have_lat = True
    elif lat_dir:
        d, c = lat_dir.groups()
        sign = direction_sign(c)
        latitude = return_type(d) * sign
        have_lat = True
    else:
        # Plain decimal: trim any non-numeric garbage at either end
        latitude = re.sub(beginning_re, u'', latitude)
        latitude = re.sub(end_re, u'', latitude)
    lon_dms = longitude_dms_regex.match(longitude)
    lon_dir = longitude_decimal_with_direction_regex.match(longitude)
    if lon_dms:
        d, m, s, c = lon_dms.groups()
        # NOTE(review): same unused-sign concern as the latitude DMS branch
        sign = direction_sign(c)
        longitude = degrees_to_decimal(d or 0, m or 0, s or 0)
        have_lon = True
    elif lon_dir:
        d, c = lon_dir.groups()
        sign = direction_sign(c)
        longitude = return_type(d) * sign
        have_lon = True
    else:
        longitude = re.sub(beginning_re, u'', longitude)
        longitude = re.sub(end_re, u'', longitude)
    latitude = float(latitude)
    longitude = float(longitude)
    if not is_valid_latitude(latitude):
        raise ValueError('Invalid latitude: {}'.format(latitude))
    if not is_valid_longitude(longitude):
        raise ValueError('Invalid longitude: {}'.format(longitude))
    latitude = to_valid_latitude(latitude)
    longitude = to_valid_longitude(longitude)
    return latitude, longitude
def phrase(cls, number, language, country=None):
    """Build a conscription-number phrase for *number* in *language*.

    Returns None when number is None; otherwise delegates to
    cls.numeric_phrase with the house-numbers dictionary.
    """
    if number is None:
        return number
    # Removed an unused local ('default') that was assigned but never read
    key = 'conscription_numbers.alphanumeric'
    dictionaries = ['house_numbers']
    return cls.numeric_phrase(key, safe_decode(number), language, dictionaries=dictionaries, country=country)
def random(cls, language, country=None):
    """Generate a random staircase identifier: numeric, hyphenated range,
    alphabetic, or a letter/number combination per the sampled type."""
    num_type, num_type_props = cls.choose_alphanumeric_type('staircases.alphanumeric', language, country=country)
    if num_type is None:
        return None
    if num_type == cls.NUMERIC:
        number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
        return safe_decode(number)
    elif num_type == cls.HYPHENATED_NUMBER:
        # Second number is an offset from the first so the range is ascending
        number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
        number2 = number + weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
        return u'{}-{}'.format(number, number2)
    else:
        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        # With probability (1 - alphabet_probability), fall back to Latin
        if alphabet_probability is not None and random.random() >= alphabet_probability:
            alphabet = latin_alphabet
        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)
        else:
            number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
            whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
            hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
            # Single random draw decides the separator: space, hyphen, or none
            whitespace_phrase = u''
            r = random.random()
            if r < whitespace_probability:
                whitespace_phrase = u' '
            elif r < (whitespace_probability + hyphen_probability):
                whitespace_phrase = u'-'
            if num_type == cls.ALPHA_PLUS_NUMERIC:
                return six.u('{}{}{}').format(letter, whitespace_phrase, number)
            elif num_type == cls.NUMERIC_PLUS_ALPHA:
                return six.u('{}{}{}').format(number, whitespace_phrase, letter)
def create_from_osm_file(cls, filename, output_dir, precision=None):
    '''
    Given an OSM file (planet or some other bounds) containing relations
    and their dependencies, create an R-tree index for coarse-grained
    reverse geocoding.

    Note: the input file is expected to have been created using
    osmfilter. Use fetch_osm_address_data.sh for planet or copy the
    admin borders commands if using other bounds.
    '''
    if precision is None:
        precision = cls.GEOHASH_PRECISION
    index = cls(save_dir=output_dir, precision=precision)
    i = 0
    for element_id, props, deps in parse_osm(filename):
        props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
        # element_id looks like "type:id"; keep only the numeric id
        node_id = long(element_id.split(':')[-1])
        lat = props.get('lat')
        lon = props.get('lon')
        if lat is None or lon is None:
            continue
        lat, lon = latlon_to_decimal(lat, lon)
        if lat is None or lon is None:
            continue
        # Nudge the antimeridian to keep the longitude in-range for the index
        if isclose(lon, 180.0):
            lon = 179.999
        # Keep id/type, whitelisted keys, and namespaced keys whose
        # "<namespace>:*" wildcard is whitelisted
        props = {k: v for k, v in six.iteritems(props)
                 if k in ('id', 'type') or k in cls.include_property_patterns or (six.u(':') in k and six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}
        props['type'] = 'node'
        props['id'] = node_id
        index.add_point(lat, lon, props)
        # Progress logging every 1000 points
        if i % 1000 == 0 and i > 0:
            print('did {} points'.format(i))
        i += 1
    return index
def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
    """Expand an OSM house-number value into a list of individual numbers.

    Handles numeric ranges (capped at max_range values, zero-padded when the
    start has a leading zero) and optionally letter ranges (e.g. A-D).
    Unrecognized tokens are kept verbatim (decoded).
    """
    value = normalize_string(value, string_options=NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE)
    numbers = []
    values = number_split_regex.split(value)
    for val in values:
        val = val.strip()
        match = number_range_regex.match(val)
        if match:
            start_num, end_num = match.groups()
            start_num_len = len(start_num)
            # Preserve zero-padding width when the range start has leading zeros
            zfill = 0
            if start_num.startswith('0'):
                zfill = start_num_len
            try:
                start_num = int(start_num)
                end_num = int(end_num)
                if end_num > start_num:
                    # Cap runaway ranges at max_range values
                    if end_num - start_num > max_range:
                        end_num = start_num + max_range
                    for i in xrange(start_num, end_num + 1):
                        numbers.append(safe_decode(i).zfill(zfill))
                else:
                    # Descending/equal "range": keep the raw token
                    numbers.append(val.strip().zfill(zfill))
                continue
            except (TypeError, ValueError):
                numbers.append(safe_decode(val).strip().zfill(zfill))
                continue
        else:
            letter_match = letter_range_regex.match(val)
            if letter_match and parse_letter_range:
                start_num, end_num = letter_match.groups()
                start_num = ord(start_num)
                end_num = ord(end_num)
                if end_num > start_num:
                    if end_num - start_num > max_range:
                        end_num = start_num + max_range
                    for i in xrange(start_num, end_num + 1):
                        numbers.append(six.unichr(i))
                else:
                    # Descending letter range: keep only the two endpoints
                    numbers.extend([six.unichr(start_num), six.unichr(end_num)])
                continue
            else:
                numbers.append(safe_decode(val.strip()))
    return numbers
def random_from_int(cls, number, language, country=None):
    """Render integer floor *number* as a random level representation:
    numeric, Roman numeral, hyphenated range, or letter-based forms."""
    num_type, num_type_props = cls.choose_alphanumeric_type('levels.alphanumeric', language, country=country)
    if num_type is None:
        return None
    # Some locales number floors starting at 0, others at 1
    numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
    if number >= 0:
        number += numbering_starts_at
    if num_type == cls.NUMERIC:
        return safe_decode(number)
    elif num_type == cls.ROMAN_NUMERAL:
        roman_numeral = numeric_expressions.roman_numeral(number)
        if roman_numeral is not None:
            return roman_numeral
        else:
            # Fall back to digits when no Roman form exists (e.g. negatives)
            return safe_decode(number)
    elif num_type == cls.HYPHENATED_NUMBER:
        number2 = number + sample_floors_range(1, cls.max_floors)
        return u'{}-{}'.format(number, number2)
    else:
        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        if alphabet_probability is not None and random.random() >= alphabet_probability:
            alphabet = latin_alphabet
        letter = sample_alphabet(alphabet)
        if num_type == cls.ALPHA:
            return letter
        else:
            # Note: the incoming floor number is discarded here and a
            # letter-paired number is sampled instead
            number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)
            if num_type == cls.ALPHA_PLUS_NUMERIC:
                return six.u('{}{}').format(letter, number)
            elif num_type == cls.NUMERIC_PLUS_ALPHA:
                return six.u('{}{}').format(number, letter)
    return None
def phrase(cls, unit, language, country=None, zone=None):
    """Build a unit phrase for *unit* (with optional zone-specific config),
    or sample a standalone unit-type phrase when unit is None."""
    if unit is not None:
        key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)
        # Unit phrases may be unconfigured for this language/country
        if not address_config.get_property(key, language, country=country):
            return None
        is_alpha = safe_decode(unit).isalpha()
        direction_unit = None
        add_direction = address_config.get_property('{}.add_direction'.format(key), language, country=country)
        if add_direction:
            direction_unit = cls.add_direction(key, unit, language, country=country)
        if direction_unit and direction_unit != unit:
            # A direction was added, so the unit is no longer purely alphabetic
            unit = direction_unit
            is_alpha = False
        else:
            add_quadrant = address_config.get_property('{}.add_quadrant'.format(key), language, country=country)
            if add_quadrant:
                unit = cls.add_quadrant(key, unit, language, country=country)
                is_alpha = False
        return cls.numeric_phrase(key, safe_decode(unit), language, dictionaries=['unit_types_numbered'], country=country, is_alpha=is_alpha)
    else:
        key = 'units.standalone'
        values, probs = address_config.alternative_probabilities(key, language, dictionaries=['unit_types_standalone'], country=country)
        if values is None:
            return None
        phrase, phrase_props = weighted_choice(values, probs)
        return phrase.title()
def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
    """Expand an OSM house-number string into individual numbers.

    Numeric ranges are enumerated (capped at max_range values, preserving
    leading-zero padding); letter ranges (A-D) are enumerated when
    parse_letter_range is True; anything else is kept verbatim.
    """
    value = normalize_string(value, string_options=NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE)
    numbers = []
    values = number_split_regex.split(value)
    for val in values:
        val = val.strip()
        match = number_range_regex.match(val)
        if match:
            start_num, end_num = match.groups()
            start_num_len = len(start_num)
            # Leading zero on the start => preserve the zero-padded width
            zfill = 0
            if start_num.startswith('0'):
                zfill = start_num_len
            try:
                start_num = int(start_num)
                end_num = int(end_num)
                if end_num > start_num:
                    if end_num - start_num > max_range:
                        end_num = start_num + max_range
                    for i in xrange(start_num, end_num + 1):
                        numbers.append(safe_decode(i).zfill(zfill))
                else:
                    numbers.append(val.strip().zfill(zfill))
                continue
            except (TypeError, ValueError):
                # Not actually integers: keep the raw token
                numbers.append(safe_decode(val).strip().zfill(zfill))
                continue
        else:
            letter_match = letter_range_regex.match(val)
            if letter_match and parse_letter_range:
                start_num, end_num = letter_match.groups()
                start_num = ord(start_num)
                end_num = ord(end_num)
                if end_num > start_num:
                    if end_num - start_num > max_range:
                        end_num = start_num + max_range
                    for i in xrange(start_num, end_num + 1):
                        numbers.append(six.unichr(i))
                else:
                    numbers.extend([six.unichr(start_num), six.unichr(end_num)])
                continue
            else:
                numbers.append(safe_decode(val.strip()))
    return numbers
def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
    """Combine a phrase (e.g. a house-number word or affix) with *number*
    according to the direction/whitespace settings in *props*.

    Returns just the phrase when number is None, and just the decoded
    number when no direction is configured.
    """
    if num_type == cls.NUMERIC_AFFIX:
        phrase = props['affix']
        # Zero-pad purely numeric values when the affix config requests it
        if 'zero_pad' in props and number.isdigit():
            number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))
    direction = props['direction']
    whitespace = props.get('whitespace', whitespace_default)
    whitespace_probability = props.get('whitespace_probability')
    if whitespace_probability is not None:
        whitespace = random.random() < whitespace_probability
    if props.get('title_case', True):
        # Title case unless the config specifies otherwise
        phrase = phrase.title()
    if number is None:
        return phrase
    whitespace_phrase = six.u(' ') if whitespace else six.u('')
    # Phrase goes to the left of the number
    if direction == 'left':
        return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
    # Phrase goes to the right of the number
    elif direction == 'right':
        return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
    # Need to specify a direction, otherwise return naked number
    else:
        return safe_decode(number)
def for_floor(cls, floor_number, num_digits=None):
    """Append a sampled positive unit number (zero-padded to *num_digits*,
    sampled when not given) to *floor_number*."""
    if num_digits is None:
        num_digits = cls.sample_num_digits()
    chosen_unit = weighted_choice(cls.positive_units_floors, cls.positive_units_floors_cdf)
    padded_unit = safe_decode(chosen_unit).zfill(num_digits)
    return six.u('{}{}').format(floor_number, padded_unit)
def pick_phrase_and_type(cls, number, language, country=None):
    """Sample a phrase for *number* and decide whether it is rendered as a
    plain numeric phrase or a numeric affix.

    Returns (num_type, phrase, props-for-num-type); when no alternatives
    exist, returns the 3-tuple (None, decoded-number-or-None, None).
    """
    values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
    if not values:
        # Note: conditional binds tighter than the commas — this is a 3-tuple
        return None, safe_decode(number) if number is not None else None, None
    phrase, phrase_props = weighted_choice(values, probs)
    # Collect the per-type probabilities configured for this phrase
    values = []
    probs = []
    for num_type in (cls.NUMERIC, cls.NUMERIC_AFFIX):
        key = '{}_probability'.format(num_type)
        prob = phrase_props.get(key, None)
        if prob is not None:
            values.append(num_type)
            probs.append(prob)
    if not probs:
        # Default to plain numeric when no probabilities are configured
        num_type = cls.NUMERIC
    else:
        probs = cdf(probs)
        num_type = weighted_choice(values, probs)
    return num_type, phrase, phrase_props[num_type]
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, token_options=DEFAULT_TOKEN_OPTIONS, strip_parentheticals=True, whitespace=False):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.

    This version only uses libpostal's deterministic normalizations
    i.e. methods with a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    s = safe_decode(s)
    # Renamed the local (was 'normalized_tokens') so it no longer shadows
    # this function's own name
    tokens = _normalize.normalized_tokens(s, string_options, token_options, whitespace)
    if strip_parentheticals:
        tokens = remove_parens(tokens)
    # Map the C-level numeric token-type ids to token_types instances
    return [(s, token_types.from_id(token_type)) for s, token_type in tokens]
def remove_components(self, template, tags):
    """Rebuild a Mustache *template* with every tag in *tags* removed.

    Walks pystache's parse tree: "first-of" sections are rebuilt from their
    surviving keys, plain keys are re-emitted as triple-mustache tags, and
    the literal text following a removed tag is dropped (last_removed).
    """
    new_components = []
    tags = set(tags)
    parsed = pystache.parse(safe_decode(template))
    last_removed = False
    for i, el in enumerate(parsed._parse_tree):
        if hasattr(el, 'parsed'):
            # Section node: keep only keys not being removed
            keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key') and e.key not in tags]
            if keys:
                new_components.append(self.build_first_of_template(keys))
                last_removed = False
            else:
                last_removed = True
        elif hasattr(el, 'key'):
            if el.key not in tags:
                # Escaped braces: renders as {{{key}}}
                new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
                last_removed = False
            else:
                last_removed = True
        elif not last_removed:
            # Literal text between tags, kept unless the preceding tag was removed
            new_components.append(el)
        else:
            last_removed = False
    return ''.join(new_components).strip()
def search_suffix(self, token):
    """Search for *token*'s longest known suffix (via the reversed-substring
    index) and return its '|'-separated values as a list, or None."""
    reversed_token = safe_decode(token[::-1])
    hits, _hit_len = self.search_substring(reversed_token)
    if not hits:
        return None
    return hits[0].split('|')
def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
    """Compile *lexicon* — a sequence of (pattern, response) pairs — into a
    single alternation regex, one capture group per pattern."""
    self.lexicon = lexicon
    patterns, handlers = zip(*lexicon)
    grouped = [u'({})'.format(safe_decode(p)) for p in patterns]
    self.regex = re.compile(u'|'.join(grouped), flags)
    self.responses = handlers
def remove_components(self, template, tags):
    """Return *template* with every Mustache tag in *tags* removed.

    Uses pystache's parse tree (private _parse_tree attribute); literal
    text immediately after a removed tag is also dropped via last_removed.
    """
    new_components = []
    tags = set(tags)
    parsed = pystache.parse(safe_decode(template))
    last_removed = False
    for i, el in enumerate(parsed._parse_tree):
        if hasattr(el, 'parsed'):
            # Section: rebuild a first-of template from the surviving keys
            keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key') and e.key not in tags]
            if keys:
                new_components.append(self.build_first_of_template(keys))
                last_removed = False
            else:
                last_removed = True
        elif hasattr(el, 'key'):
            if el.key not in tags:
                # Re-emit as a triple-mustache {{{key}}} tag
                new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
                last_removed = False
            else:
                last_removed = True
        elif not last_removed:
            new_components.append(el)
        else:
            last_removed = False
    return ''.join(new_components).strip()
def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
    """Normalize *s* with libpostal's C normalizer, choosing the
    Latin-ASCII or UTF-8 path based on the option bitmask."""
    decoded = safe_decode(s)
    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
        return _normalize.normalize_string_latin(decoded, string_options)
    return _normalize.normalize_string_utf8(decoded, string_options)
def cldr_country_names(self, language):
    '''
    Country names are tricky as there can be several versions
    and levels of verbosity e.g. United States of America
    vs. the more commonly used United States. Most countries
    have a similarly verbose form.

    The CLDR repo (http://cldr.unicode.org/) has the most
    comprehensive localized database of country names
    (among other things), organized by language. This function
    parses CLDR XML for a given language and returns a dictionary
    of {country_code: name} for that language.
    '''
    filename = os.path.join(self.base_dir, '{}.xml'.format(language))
    # Fix: close the XML file deterministically instead of leaking the
    # handle to the garbage collector
    with open(filename) as f:
        xml = etree.parse(f)
    country_names = defaultdict(dict)
    for territory in xml.xpath('*//territories/*'):
        country_code = territory.attrib['type']
        # Skip ignored codes and numeric region codes (continents, etc.)
        if country_code in IGNORE_COUNTRIES or country_code.isdigit():
            continue
        # Keyed by the 'alt' attribute (None for the default name)
        country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
    display_names = {}
    for country_code, names in country_names.iteritems():
        # Hard-coded overrides take precedence over CLDR
        if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
            display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
            continue
        default_name = names.get(None)
        if country_code in COUNTRY_USE_SHORT_NAME:
            display_names[country_code] = names.get('short', default_name)
        elif country_code in COUNTRY_USE_VARIANT_NAME:
            display_names[country_code] = names.get('variant', default_name)
        elif default_name is not None:
            display_names[country_code] = default_name
    return display_names
def phrase(cls, number, language, country=None):
    """With configured probability, render *number* as a house-number phrase
    (or a no-number phrase when number is None); otherwise return the
    decoded number (or None).
    """
    if number is not None:
        prob_key = 'house_numbers.alphanumeric_phrase_probability'
        key = 'house_numbers.alphanumeric'
        dictionaries = ['house_numbers', 'number']
        default = safe_decode(number)
    else:
        prob_key = 'house_numbers.no_number_probability'
        key = 'house_numbers.no_number'
        dictionaries = ['no_number']
        default = None
    phrase_prob = address_config.get_property(prob_key, language, country=country, default=0.0)
    if random.random() < phrase_prob:
        # NOTE(review): when number is None this passes safe_decode(None) —
        # presumably numeric_phrase/safe_decode tolerate None; confirm
        return cls.numeric_phrase(key, safe_decode(number), language, dictionaries=dictionaries, country=country)
    return default
def phrase(cls, box_number, language, country=None):
    """Render *box_number* as a PO box phrase for *language*, or None when
    no box number is given."""
    if box_number is None:
        return None
    decoded = safe_decode(box_number)
    return cls.numeric_phrase('po_boxes.alphanumeric', decoded, language, dictionaries=['post_office'], country=country)
def download_pre_release_downloads(out_dir):
    """Download and unzip every configured pre-release OpenAddresses file
    into *out_dir*.

    Returns True when all downloads succeed, False on the first failure.
    """
    for url in openaddresses_config.config.get('pre_release_downloads', []):
        print(six.u('doing pre_release {}').format(safe_decode(url)))
        success = download_and_unzip_file(url, out_dir)
        if not success:
            # Bug fix: previously referenced an undefined name 'source'
            # here, raising NameError on the failure path; report the URL
            print(six.u('ERR: could not download {}').format(url))
            return False
    return True
def cldr_country_names(language, base_dir=CLDR_MAIN_PATH):
    """
    Country names are tricky as there can be several versions
    and levels of verbosity e.g. United States of America
    vs. the more commonly used United States. Most countries
    have a similarly verbose form.

    The CLDR repo (http://cldr.unicode.org/) has the most
    comprehensive localized database of country names
    (among other things), organized by language. This function
    parses CLDR XML for a given language and returns a dictionary
    of {country_code: name} for that language.
    """
    filename = os.path.join(base_dir, "{}.xml".format(language))
    # Fix: close the XML file deterministically rather than leaking the
    # open handle to the garbage collector
    with open(filename) as f:
        xml = etree.parse(f)
    country_names = defaultdict(dict)
    for territory in xml.xpath("*//territories/*"):
        country_code = territory.attrib["type"]
        # Skip ignored codes and numeric region codes (continents, etc.)
        if country_code in IGNORE_COUNTRIES or country_code.isdigit():
            continue
        # Keyed by the 'alt' attribute (None for the default name)
        country_names[country_code][territory.attrib.get("alt")] = safe_decode(territory.text)
    display_names = {}
    for country_code, names in country_names.iteritems():
        # Hard-coded overrides take precedence over CLDR
        if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
            display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
            continue
        default_name = names.get(None)
        if country_code in COUNTRY_USE_SHORT_NAME:
            display_names[country_code] = names.get("short", default_name)
        elif country_code in COUNTRY_USE_VARIANT_NAME:
            display_names[country_code] = names.get("variant", default_name)
        elif default_name is not None:
            display_names[country_code] = default_name
    return display_names
def normalize_wikipedia_title(title):
    """Normalize a Wikipedia title: drop the parenthetical apposition,
    unescape HTML entities, URL-decode, and turn underscores into spaces."""
    apposition = apposition_regex.match(title)
    if apposition:
        title = apposition.group(1)
    decoded = safe_decode(title)
    unescaped = html_parser.unescape(decoded)
    unquoted = urllib.unquote_plus(unescaped)
    return unquoted.replace(u'_', u' ').strip()
def phrase(cls, language, key, value, is_plural=False, country=None):
    """Build a CategoryQuery for the category (key, value) in *language*,
    optionally prefixed with a sampled preposition phrase ("near", etc.).

    Returns NULL_CATEGORY_QUERY when no phrase exists for the category.
    """
    category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
    if not category_phrase:
        return NULL_CATEGORY_QUERY
    category_phrase = safe_decode(category_phrase)
    prep_phrase_type = CategoryPreposition.random(language, country=country)
    # No preposition: bare category with both place name and address allowed
    if prep_phrase_type in (None, CategoryPreposition.NULL):
        return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
    values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
    if not values:
        return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
    prep_phrase, prep_phrase_props = weighted_choice(values, probs)
    prep_phrase = safe_decode(prep_phrase)
    # Relative prepositions ("nearby", "near me", "in") suppress explicit
    # address and/or place-name components
    add_address = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
    add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
    return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
def random(cls, language, country=None):
    """Generate a random block identifier (numeric, alphabetic, or a
    combination) according to the sampled alphanumeric type."""
    num_type, num_type_props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
    if num_type is None:
        return None
    if num_type == cls.NUMERIC:
        number = weighted_choice(cls.block_range, cls.block_range_cdf)
        return safe_decode(number)
    else:
        alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
        alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
        # With probability (1 - alphabet_probability), fall back to Latin
        if alphabet_probability is not None and random.random() >= alphabet_probability:
            alphabet = latin_alphabet
        letter = sample_alphabet(alphabet, 2.0)
        if num_type == cls.ALPHA:
            return safe_decode(letter)
        else:
            number = weighted_choice(cls.block_range, cls.block_range_cdf)
            whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
            # Separator is a space with the configured probability, else empty
            whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')
            if num_type == cls.ALPHA_PLUS_NUMERIC:
                return six.u('{}{}{}').format(letter, whitespace_phrase, number)
            elif num_type == cls.NUMERIC_PLUS_ALPHA:
                return six.u('{}{}{}').format(number, whitespace_phrase, letter)
def add_affixes(self, lang, *confs):
    """Register known name prefixes/suffixes for *lang* and compile the
    regexes used to detect (and strip) them.

    "no_whitespace" variants attach directly to the name; the rest must be
    separated by a space or hyphen. "similarity_only" variants are folded
    into the *_sim_only_* regexes together with the regular affixes.
    """
    prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
    prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]
    self.language_prefixes[lang] = prefixes + prefixes_no_whitespace
    suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
    suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]
    self.language_suffixes[lang] = suffixes + suffixes_no_whitespace
    # Separator character class: space or hyphen
    whitespace_phrase = six.u('[ \-]')
    all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
    all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
    if all_prefixes:
        prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
        self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
    if all_suffixes:
        suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
        self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
    sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]
    if sim_only_prefixes:
        sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes))
        self.language_prefix_sim_only_regexes[lang] = re.compile(six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes)), re.I | re.UNICODE)
    # Bug fix: each similarity-only suffix alternative was built with
    # '(?:{})$'.format(whitespace_phrase, phrase) — the phrase argument was
    # silently dropped and a '$' anchor embedded per alternative. Build the
    # separator+phrase pair like the non-sim suffixes; the '$' anchor is
    # applied once to the joined pattern below.
    sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]
    if sim_only_suffixes:
        sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
        self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
def get_script_codes(all_scripts):
    """Map 4-letter ISO-15924 script codes to the script names in
    *all_scripts*, downloading and caching the ISO-15924 table on first use.

    Supplements the table with Unicode property-value aliases and the
    hard-coded SCRIPT_ALIASES_SUPPLEMENTAL mapping.
    """
    if not os.path.exists(LOCAL_ISO_15924_FILE):
        # This comes as a .zip
        script_codes_response = requests.get(ISO_15924_URL)
        zf = ZipFile(StringIO(script_codes_response.content))
        iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]
        # Strip out the comments, etc.
        temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n') if line.strip() and not line.strip().startswith('#')])
        # Fix: write the cached file via a context manager (was open/close
        # with no protection against exceptions); also dropped the unused
        # temp_dir/script_codes_filename locals
        with open(LOCAL_ISO_15924_FILE, 'w') as f:
            f.write(safe_encode(temp_iso15924_file))
    script_codes = {}
    seen_scripts = set()
    # Fix: close the cached codes file deterministically (was never closed)
    with open(LOCAL_ISO_15924_FILE) as script_codes_file:
        # Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
        for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
            if name in all_scripts:
                script_codes[code] = name
                seen_scripts.add(name)
            else:
                # e.g. "Name (variant)" -> "Name"
                normalized_name = name.split('(')[0].strip()
                if normalized_name in all_scripts and normalized_name not in seen_scripts:
                    script_codes[code] = normalized_name
                    seen_scripts.add(normalized_name)
    value_aliases = get_property_value_aliases()
    script_aliases = value_aliases['sc']
    for code, script in script_aliases.iteritems():
        if code not in script_codes and script in all_scripts:
            script_codes[code] = script
    script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)
    return script_codes
def name(self, country, language, component, name):
    """Apply country regex replacements and a randomly sampled prefix or
    suffix to *name*; the first affix applied returns immediately.

    Returns the (possibly unchanged) name.
    """
    # Country-specific replacements plus global (None-keyed) ones
    all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, [])
    prefixes, prefix_probs = self.prefixes.get((language, component), (None, None))
    suffixes, suffix_probs = self.suffixes.get((language, component), (None, None))
    if not all_replacements and not prefixes and not suffixes:
        return name
    # Each replacement fires independently with its own probability
    for regex, group, prob in all_replacements:
        match = regex.match(name)
        if match and random.random() < prob:
            name = match.group(group)
    # direction 0 = prepend (prefix), 1 = append (suffix)
    for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0), (suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)):
        if affixes is not None:
            regex = regexes[language, component]
            # Skip if the name already starts/ends with an affix of this kind
            if regex.match(name):
                continue
            affix = weighted_choice(affixes, affix_probs)
            if affix is not None:
                # affix is a config dict; affix[key] is the affix string
                whitespace = affix.get('whitespace', True)
                space_val = six.u(' ') if whitespace else six.u('')
                affix = affix[key]
                if direction == 0:
                    return six.u('{}{}{}').format(affix, space_val, safe_decode(name))
                else:
                    return six.u('{}{}{}').format(safe_decode(name), space_val, affix)
    return name
def phrase(cls, chain, language, country=None):
    """Build a ChainQuery for *chain*, optionally preceded by a sampled
    preposition phrase; relative prepositions suppress address and/or
    place-name components."""
    if not chain:
        return NULL_CHAIN_QUERY
    chain_text = safe_decode(chain)
    prep_type = CategoryPreposition.random(language, country=country)
    if prep_type in (None, CategoryPreposition.NULL):
        return ChainQuery(chain_text, prep=None, add_place_name=True, add_address=True)
    prep_values, prep_probs = address_config.alternative_probabilities('categories.{}'.format(prep_type), language, country=country)
    if not prep_values:
        return ChainQuery(chain_text, prep=None, add_place_name=True, add_address=True)
    chosen_prep, _chosen_props = weighted_choice(prep_values, prep_probs)
    chosen_prep = safe_decode(chosen_prep)
    relative_preps = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
    use_address = prep_type not in relative_preps + (CategoryPreposition.IN,)
    use_place_name = prep_type not in relative_preps
    return ChainQuery(chain_text, prep=chosen_prep, add_place_name=use_place_name, add_address=use_address)
def __init__(self, base_dir=STATE_DIR):
    """Load per-country state abbreviation YAML configs from *base_dir*.

    Builds:
      abbreviations: country -> {(full_name_lower, language): [abbreviations]}
      full_names:    country -> {abbreviation_lower: {language: full_name}}
    """
    self.full_names = {}
    self.abbreviations = {}
    for filename in os.listdir(base_dir):
        country = filename.split('.yaml')[0]
        # Fixed: the original passed open(...) directly to yaml.load and
        # never closed the handle. NOTE(review): yaml.load without an
        # explicit Loader can construct arbitrary objects; if these configs
        # are not fully trusted, switch to yaml.safe_load.
        with open(os.path.join(base_dir, filename)) as f:
            country_config = yaml.load(f)
        country_abbreviations = defaultdict(list)
        country_full_names = defaultdict(dict)
        for abbreviation, vals in six.iteritems(country_config):
            for language, full_name in six.iteritems(vals):
                full_name = safe_decode(full_name)
                abbreviation = safe_decode(abbreviation)
                country_abbreviations[(full_name.lower(), language)].append(abbreviation)
                country_full_names[abbreviation.lower()][language] = full_name
        self.abbreviations[country] = dict(country_abbreviations)
        self.full_names[country] = dict(country_full_names)
def scan(self, s):
    """Tokenize *s* with the combined regex, yielding (token, response)
    pairs. Callable responses may expand a match into several pairs."""
    for match in self.regex.finditer(safe_decode(s)):
        group_idx = match.lastindex
        handler = self.responses[group_idx - 1]
        token = match.group(group_idx)
        # Plain (non-callable) responses map one match to one pair
        if not callable(handler):
            yield (token, handler)
            continue
        expanded = handler(match, token)
        if expanded is None:
            continue
        # Handler returns (response, token) pairs; emit them token-first
        for sub_handler, sub_token in expanded:
            yield (sub_token, sub_handler)
def random_from_int(cls, number, language, country=None):
    """Render the integer *number* as a level (floor) identifier using a
    randomly chosen alphanumeric style for the language/country.
    Returns None if no style could be chosen or applies."""
    num_type, num_type_props = cls.choose_alphanumeric_type('levels.alphanumeric', language, country=country)
    if num_type is None:
        return None

    # Some locales number floors from 0, others from 1
    start = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
    if number >= 0:
        number += start

    if num_type == cls.NUMERIC:
        return safe_decode(number)

    if num_type == cls.ROMAN_NUMERAL:
        roman = numeric_expressions.roman_numeral(number)
        # Fall back to digits when no Roman form exists
        return roman if roman is not None else safe_decode(number)

    if num_type == cls.HYPHENATED_NUMBER:
        upper = number + sample_floors_range(1, cls.max_floors)
        return u'{}-{}'.format(number, upper)

    # Letter-based styles: sample an alphabet, falling back to Latin
    # with probability (1 - alphabet_probability)
    alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
    alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
    if alphabet_probability is not None and random.random() >= alphabet_probability:
        alphabet = latin_alphabet
    letter = sample_alphabet(alphabet)

    if num_type == cls.ALPHA:
        return letter

    digits = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)
    if num_type == cls.ALPHA_PLUS_NUMERIC:
        return six.u('{}{}').format(letter, digits)
    if num_type == cls.NUMERIC_PLUS_ALPHA:
        return six.u('{}{}').format(digits, letter)

    return None
def phrase(cls, chain, language, country=None):
    """Return a ChainQuery for *chain*; a preposition phrase is sampled
    and attached when one is configured for the language/country."""
    if not chain:
        return NULL_CHAIN_QUERY

    chain_phrase = safe_decode(chain)
    prep_type = CategoryPreposition.random(language, country=country)

    bare_query = prep_type in (None, CategoryPreposition.NULL)
    if not bare_query:
        values, probs = address_config.alternative_probabilities(
            'categories.{}'.format(prep_type), language, country=country)
        bare_query = not values

    # No usable preposition: everything defaults to enabled
    if bare_query:
        return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)

    prep_phrase, _props = weighted_choice(values, probs)
    prep_phrase = safe_decode(prep_phrase)

    # "nearby"/"near me" suppress both place name and address; "in"
    # suppresses only the address
    add_address = prep_type not in (CategoryPreposition.NEARBY,
                                    CategoryPreposition.NEAR_ME,
                                    CategoryPreposition.IN)
    add_place_name = prep_type not in (CategoryPreposition.NEARBY,
                                       CategoryPreposition.NEAR_ME)

    return ChainQuery(chain_phrase, prep=prep_phrase,
                      add_place_name=add_place_name, add_address=add_address)
def read_dictionary_file(path):
    """Yield the pipe-delimited phrase lists from the dictionary file at *path*.

    Blank lines are skipped. Raises InvalidAddressFileException when a line
    contains '}' or has a blank synonym.
    """
    # Fixed: the original opened the file without ever closing it; the
    # context manager releases the handle even if the generator is
    # abandoned or an exception is raised mid-iteration.
    with open(path) as f:
        for i, line in enumerate(f):
            line = safe_decode(line.rstrip())
            if not line.strip():
                continue
            if u'}' in line:
                raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
            phrases = line.split(u'|')
            # any() short-circuits; sum((1 for ...)) > 0 always scanned all phrases
            if any(not p.strip() for p in phrases):
                raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
            yield phrases
def configure(self, base_dir=DICTIONARIES_DIR):
    """Build the phrase trie from per-language dictionary files under *base_dir*.

    Populates self.canonicals with each canonical phrase's synonyms and
    self.trie with phrase -> "lang|dictionary|is_canonical|canonical" entries.
    Suffix phrases are stored reversed behind SUFFIX_KEY; prefixes behind
    PREFIX_KEY.
    """
    kvs = defaultdict(OrderedDict)
    # Fixed: the original ignored the base_dir parameter and hard-coded
    # DICTIONARIES_DIR in both listdir and join.
    for lang in os.listdir(base_dir):
        for filename in self.dictionaries:
            is_suffix_dictionary = 'suffixes' in filename
            is_prefix_dictionary = 'prefixes' in filename
            dictionary_name = filename.split('.', 1)[0]
            path = os.path.join(base_dir, lang, filename)
            if not os.path.exists(path):
                continue
            # Fixed: close the file handle deterministically
            with open(path) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue
                    # First phrase on a line is the canonical form
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)
                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
                    for phrase in phrases:
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue
                        is_canonical = normalize_string(phrase) == canonical_normalized
                        if is_suffix_dictionary:
                            # Suffixes are matched from the end: store reversed
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase
                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical
    # Fixed: dict.iteritems() is Python 2 only; use six for 2/3 compat
    # like the rest of the module
    kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
           for k, vals in six.iteritems(kvs)
           for (l, d, c), i in six.iteritems(vals)]
    self.trie = BytesTrie(kvs)
    self.configured = True
def phrase(cls, language, key, value, is_plural=False, country=None):
    """Build a CategoryQuery for the (key, value) category phrase,
    optionally attaching a sampled preposition for language/country."""
    category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
    if not category_phrase:
        return NULL_CATEGORY_QUERY
    category_phrase = safe_decode(category_phrase)

    prep_type = CategoryPreposition.random(language, country=country)
    # No preposition configured/selected: plain query, everything enabled
    if prep_type in (None, CategoryPreposition.NULL):
        return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)

    values, probs = address_config.alternative_probabilities(
        'categories.{}'.format(prep_type), language, country=country)
    if not values:
        return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)

    prep_phrase, _props = weighted_choice(values, probs)
    prep_phrase = safe_decode(prep_phrase)

    # Proximity prepositions suppress the place name; "in" also
    # suppresses the address
    near_types = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
    add_place_name = prep_type not in near_types
    add_address = prep_type not in near_types + (CategoryPreposition.IN,)

    return CategoryQuery(category_phrase, prep=prep_phrase,
                         add_place_name=add_place_name, add_address=add_address)
def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
    """Load gazetteer phrase dictionaries from per-language subdirectories
    of *base_dir*.

    Populates self.languages, self.language_dictionaries
    (language -> [dictionary names]) and self.phrases
    ((language, dictionary) -> [phrase lists]).

    Raises InvalidAddressFileException for a non-.txt file, an unknown
    dictionary name, a line containing '}', or a blank synonym.
    """
    self.base_dir = base_dir
    self.languages = []
    self.language_dictionaries = defaultdict(list)
    self.phrases = defaultdict(list)

    for language in os.listdir(base_dir):
        language_dir = os.path.join(base_dir, language)
        if not os.path.isdir(language_dir):
            continue
        self.languages.append(language)

        for filename in os.listdir(language_dir):
            if not filename.endswith('.txt'):
                raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
            dictionary_name = filename.split('.')[0].lower()
            if dictionary_name not in gazetteer_types:
                raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
            self.language_dictionaries[language].append(dictionary_name)

            path = os.path.join(language_dir, filename)
            # Fixed: the original never closed the file handle
            with open(path) as f:
                for i, line in enumerate(f):
                    line = safe_decode(line.rstrip())
                    if not line.strip():
                        continue
                    if u'}' in line:
                        raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i + 1))
                    phrases = line.split(u'|')
                    if any(not p.strip() for p in phrases):
                        raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i + 1))
                    self.phrases[(language, dictionary_name)].append(phrases)

    # Freeze to plain dicts so missing keys raise instead of auto-creating
    self.language_dictionaries = dict(self.language_dictionaries)
    self.phrases = dict(self.phrases)
def scrape_nominatim_category_page(url, ignore_plurals=False):
    """Yield (phrase, key, value, is_plural) tuples scraped from a
    Nominatim special-phrases wiki page at *url*. Rows with an operator
    other than '-' are skipped; plural rows are skipped when
    ignore_plurals is True."""
    result = requests.get(url)
    if not result or not result.content:
        return
    for row in phrase_table_re.findall(result.content):
        phrase, key, value, operator, plural = row
        # Only keep rows with no operator or the '-' operator
        if operator and operator != '-':
            continue
        is_plural = plural == 'Y'
        if ignore_plurals and is_plural:
            continue
        yield safe_decode(phrase).lower(), key, value, is_plural
def random(cls, language, country=None):
    """Generate a random staircase identifier in a randomly chosen
    alphanumeric style for the language/country. Returns None when no
    style could be chosen (or implicitly for unhandled styles)."""
    num_type, num_type_props = cls.choose_alphanumeric_type('staircases.alphanumeric', language, country=country)
    if num_type is None:
        return None

    def draw_number():
        # Shared sampler for the staircase number distribution
        return weighted_choice(cls.staircase_range, cls.staircase_range_cdf)

    if num_type == cls.NUMERIC:
        return safe_decode(draw_number())

    if num_type == cls.HYPHENATED_NUMBER:
        low = draw_number()
        high = low + draw_number()
        return u'{}-{}'.format(low, high)

    # Letter-based styles: sample an alphabet, possibly falling back to Latin
    alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
    alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
    if alphabet_probability is not None and random.random() >= alphabet_probability:
        alphabet = latin_alphabet
    letter = sample_alphabet(alphabet, 2.0)

    if num_type == cls.ALPHA:
        return safe_decode(letter)

    number = draw_number()
    whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
    hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
    # One roll decides the separator: space, hyphen, or nothing
    roll = random.random()
    if roll < whitespace_probability:
        separator = u' '
    elif roll < (whitespace_probability + hyphen_probability):
        separator = u'-'
    else:
        separator = u''

    if num_type == cls.ALPHA_PLUS_NUMERIC:
        return six.u('{}{}{}').format(letter, separator, number)
    if num_type == cls.NUMERIC_PLUS_ALPHA:
        return six.u('{}{}{}').format(number, separator, letter)
def read_dictionary_file(path):
    """Yield lists of pipe-delimited phrases, one per non-blank line of
    the dictionary file at *path*.

    Raises InvalidAddressFileException when a line contains '}' or has a
    blank synonym.
    """
    # Fixed: the original opened the file without closing it; the with
    # block releases the handle even if iteration stops early or raises.
    with open(path) as f:
        for i, line in enumerate(f):
            line = safe_decode(line.rstrip())
            if not line.strip():
                continue
            if u'}' in line:
                raise InvalidAddressFileException(
                    u'Found }} in file: {}, line {}'.format(path, i + 1))
            phrases = line.split(u'|')
            if any(not p.strip() for p in phrases):
                raise InvalidAddressFileException(
                    u'Found blank synonym in: {}, line {}'.format(path, i + 1))
            yield phrases
def openaddresses_download_configured_files(out_dir):
    """Download and unzip every configured OpenAddresses source into
    *out_dir*, printing progress and an error line per failed source."""
    # Hoisted out of the loop: this call does not depend on the source
    # being processed, so invoking it once per source was redundant.
    download_pre_release_downloads(out_dir)
    for path in openaddresses_config.sources:
        source = six.b('/').join([safe_encode(p) for p in path])
        filename = safe_encode(path[-1]) + six.b('.zip')
        # Removed dead `zip_path = filename + '.zip'`: it was never used,
        # doubled the extension, and mixed bytes with str (TypeError on Py3).
        zip_url_path = six.b('/').join([safe_encode(p) for p in path[:-1]] + [quote_plus(filename)])
        url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path)
        print(six.u('doing {}').format(safe_decode(source)))
        success = download_and_unzip_file(url, out_dir)
        if not success:
            print(six.u('ERR: could not download {}').format(source))
def scrape_nominatim_category_page(url, ignore_plurals=False):
    """Scrape a Nominatim special-phrases wiki page, yielding
    (lowercased phrase, key, value, is_plural) tuples."""
    result = requests.get(url)
    if not result or not result.content:
        return
    rows = phrase_table_re.findall(result.content)
    for phrase, key, value, operator, plural in rows:
        # Skip rows whose operator is set to anything other than '-'
        if operator and operator != '-':
            continue
        is_plural = plural == 'Y'
        if ignore_plurals and is_plural:
            continue
        yield safe_decode(phrase).lower(), key, value, is_plural
def get_script_codes(all_scripts):
    """Return a dict mapping 4-letter ISO-15924 script codes to the script
    names present in *all_scripts* (as used by the CLDR repos).

    Downloads and caches the ISO-15924 table on first use, then folds in
    Unicode property-value aliases and SCRIPT_ALIASES_SUPPLEMENTAL.
    """
    if not os.path.exists(LOCAL_ISO_15924_FILE):
        # This comes as a .zip
        script_codes_response = requests.get(ISO_15924_URL)
        zf = ZipFile(StringIO(script_codes_response.content))
        iso15924_filename = [name for name in zf.namelist()
                             if name.startswith('iso15924')][0]
        # Strip out the comments, etc.
        temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n')
                                         if line.strip() and not line.strip().startswith('#')])
        # Fixed: write via a context manager (original also computed an
        # unused temp-file path here, now removed)
        with open(LOCAL_ISO_15924_FILE, 'w') as f:
            f.write(safe_encode(temp_iso15924_file))

    script_codes = {}
    seen_scripts = set()

    # Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those.
    # Fixed: read via a context manager so the handle is always closed.
    with open(LOCAL_ISO_15924_FILE) as script_codes_file:
        for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
            if name in all_scripts:
                script_codes[code] = name
                seen_scripts.add(name)
            else:
                # Some table names carry a parenthesized qualifier, e.g.
                # "Latin (Fraktur variant)"; match on the base name once
                normalized_name = name.split('(')[0].strip()
                if normalized_name in all_scripts and normalized_name not in seen_scripts:
                    script_codes[code] = normalized_name
                    seen_scripts.add(normalized_name)

    value_aliases = get_property_value_aliases()
    script_aliases = value_aliases['sc']

    # Fixed: dict.iteritems() is Python 2 only; six.iteritems works on 2 and 3
    for code, script in six.iteritems(script_aliases):
        if code not in script_codes and script in all_scripts:
            script_codes[code] = script

    script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)

    return script_codes
def configure(self, base_dir=DICTIONARIES_DIR):
    """Load the per-language dictionary files under *base_dir* into a
    BytesTrie mapping phrase -> "lang|dictionary|is_canonical|canonical".

    Suffix-dictionary phrases are stored reversed behind SUFFIX_KEY and
    prefix-dictionary phrases behind PREFIX_KEY; self.canonicals records
    each canonical phrase's synonyms.
    """
    kvs = defaultdict(OrderedDict)
    # Fixed: the base_dir parameter was ignored (DICTIONARIES_DIR was
    # hard-coded in both listdir and join)
    for lang in os.listdir(base_dir):
        for filename in self.dictionaries:
            is_suffix_dictionary = 'suffixes' in filename
            is_prefix_dictionary = 'prefixes' in filename
            dictionary_name = filename.split('.', 1)[0]
            path = os.path.join(base_dir, lang, filename)
            if not os.path.exists(path):
                continue
            # Fixed: close the dictionary file deterministically
            with open(path) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue
                    # First phrase on the line is the canonical form
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)
                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
                    for phrase in phrases:
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue
                        is_canonical = normalize_string(phrase) == canonical_normalized
                        if is_suffix_dictionary:
                            # Suffixes match from the end of a token: store reversed
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase
                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical
    # Fixed: dict.iteritems() is Python 2 only; use six for 2/3 compat
    kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
           for k, vals in six.iteritems(kvs)
           for (l, d, c), i in six.iteritems(vals)]
    self.trie = BytesTrie(kvs)
    self.configured = True
def replace_suffixes(self, name, lang, country=None, sim_only=False):
    """Strip configured suffixes from *name* for the given language
    (and country, when a country-specific pattern exists). With
    sim_only=True, prefer the similarity-only suffix patterns when the
    language has them."""
    name = safe_decode(name).strip()

    # Choose the pattern table; renamed from `re`, which shadowed the
    # stdlib module name
    if sim_only and lang in self.language_suffix_sim_only_regexes:
        table = self.language_suffix_sim_only_regexes
    else:
        table = self.language_suffix_regexes

    if country is not None:
        country_pattern = table.get((country, lang))
        if country_pattern:
            name = country_pattern.sub(six.u(''), name)

    lang_pattern = table.get(lang)
    if not lang_pattern:
        return name
    return lang_pattern.sub(six.u(''), name)
def cleanup_number(cls, num, strip_commas=False):
    """Normalize a numeric string.

    Strips surrounding whitespace and optionally commas. A string that
    already parses as an int is returned unchanged; a float string with
    an integral value (e.g. '007.0') is collapsed to its integer form
    with leading zeros preserved ('007'). Anything non-numeric is
    returned as-is (stripped).
    """
    num = num.strip()
    if strip_commas:
        num = num.replace(six.u(','), six.u(''))
    try:
        # Probe only: an int string needs no further cleanup
        # (was `num_int = int(num)` with num_int never used)
        int(num)
    except (ValueError, TypeError):
        try:
            num_float = float(num)
            # Count leading zeros so they survive the int round-trip
            leading_zeros = 0
            for c in num:
                if c == six.u('0'):
                    leading_zeros += 1
                else:
                    break
            num = safe_decode(int(num_float))
            if leading_zeros:
                num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
        except (ValueError, TypeError):
            # Not numeric at all: leave the stripped string untouched
            pass
    return num
def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
    """Load gazetteer phrase dictionaries from per-language directories
    under *base_dir* into self.languages, self.language_dictionaries and
    self.phrases.

    Raises InvalidAddressFileException for a non-.txt file, an unknown
    dictionary name, a line containing '}', or a blank synonym.
    """
    self.base_dir = base_dir
    self.languages = []
    self.language_dictionaries = defaultdict(list)
    self.phrases = defaultdict(list)

    for language in os.listdir(base_dir):
        language_dir = os.path.join(base_dir, language)
        if not os.path.isdir(language_dir):
            continue
        self.languages.append(language)

        for filename in os.listdir(language_dir):
            if not filename.endswith('.txt'):
                raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
            dictionary_name = filename.split('.')[0].lower()
            if dictionary_name not in gazetteer_types:
                raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
            self.language_dictionaries[language].append(dictionary_name)

            path = os.path.join(language_dir, filename)
            # Fixed: the original left the file handle open
            with open(path) as f:
                for i, line in enumerate(f):
                    line = safe_decode(line.rstrip())
                    if not line.strip():
                        continue
                    if u'}' in line:
                        raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
                    phrases = line.split(u'|')
                    if any(not p.strip() for p in phrases):
                        raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
                    self.phrases[(language, dictionary_name)].append(phrases)

    # Freeze to plain dicts so later missing-key access raises KeyError
    self.language_dictionaries = dict(self.language_dictionaries)
    self.phrases = dict(self.phrases)